i965/skl: Add a message header for the TXF_MCS instruction in vec4vs
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
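/* As a concrete example of the layout built below (illustrative, not from
 * the original source): packing vec2(1.0, 2.0) stores half(1.0) = 0x3c00
 * from the X channel in the low word and half(2.0) = 0x4000 from the Y
 * channel in the high word, so each enabled channel of dst ends up holding
 * 0x40003c00.
 */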
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
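/* Concretely (illustrative example): for a packed input of 0x40003c00, the
 * AND below produces 0x3c00 in tmp.x, the SHR produces 0x4000 in tmp.y,
 * and F16TO32 then yields dst.xy = (1.0, 2.0) -- the inverse of the pack
 * case above.
 */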
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
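/* 0x00, 0x60, 0x70 and 0x78 are the packed vector-float (VF) encodings of
 * 0.0, 8.0, 16.0 and 24.0; the type-converting MOV into the UD register
 * below turns them into the shift counts <0, 8, 16, 24>.
 */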
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
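/* For example: float, vec2 and vec4 all take one vec4 slot here, mat4
 * takes 4 via the matrix case above, and vec3[10] takes 10 via the array
 * case below.
 */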
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = brw_swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (strncmp(ir->name, storage->name, namelen) != 0 ||
690 (storage->name[namelen] != 0 &&
691 storage->name[namelen] != '.' &&
692 storage->name[namelen] != '[')) {
693 continue;
694 }
695
696 gl_constant_value *components = storage->storage;
697 unsigned vector_count = (MAX2(storage->array_elements, 1) *
698 storage->type->matrix_columns);
699
700 for (unsigned s = 0; s < vector_count; s++) {
701 assert(uniforms < uniform_array_size);
702 uniform_vector_size[uniforms] = storage->type->vector_elements;
703
704 int i;
705 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
706 stage_prog_data->param[uniforms * 4 + i] = components;
707 components++;
708 }
709 for (; i < 4; i++) {
710 static gl_constant_value zero = { 0.0 };
711 stage_prog_data->param[uniforms * 4 + i] = &zero;
712 }
713
714 uniforms++;
715 }
716 }
717 }
718
719 void
720 vec4_visitor::setup_uniform_clipplane_values()
721 {
722 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
723
724 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 4;
727 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
728 this->userplane[i].type = BRW_REGISTER_TYPE_F;
729 for (int j = 0; j < 4; ++j) {
730 stage_prog_data->param[this->uniforms * 4 + j] =
731 (gl_constant_value *) &clip_planes[i][j];
732 }
733 ++this->uniforms;
734 }
735 }
736
737 /* Our support for builtin uniforms is even scarier than non-builtin.
738 * It sits on top of the PROG_STATE_VAR parameters that are
739 * automatically updated from GL context state.
740 */
741 void
742 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
743 {
744 const ir_state_slot *const slots = ir->get_state_slots();
745 assert(slots != NULL);
746
747 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
748 /* This state reference has already been set up by ir_to_mesa,
749 * but we'll get the same index back here. We can reference
750 * ParameterValues directly, since unlike brw_fs.cpp, we never
751 * add new state references during compile.
752 */
753 int index = _mesa_add_state_reference(this->prog->Parameters,
754 (gl_state_index *)slots[i].tokens);
755 gl_constant_value *values =
756 &this->prog->Parameters->ParameterValues[index][0];
757
758 assert(this->uniforms < uniform_array_size);
759
760 for (unsigned j = 0; j < 4; j++)
761 stage_prog_data->param[this->uniforms * 4 + j] =
762 &values[GET_SWZ(slots[i].swizzle, j)];
763
764 this->uniform_vector_size[this->uniforms] =
765 (ir->type->is_scalar() || ir->type->is_vector() ||
766 ir->type->is_matrix() ? ir->type->vector_elements : 4);
767
768 this->uniforms++;
769 }
770 }
771
772 dst_reg *
773 vec4_visitor::variable_storage(ir_variable *var)
774 {
775 return (dst_reg *)hash_table_find(this->variable_ht, var);
776 }
777
778 void
779 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
780 enum brw_predicate *predicate)
781 {
782 ir_expression *expr = ir->as_expression();
783
784 *predicate = BRW_PREDICATE_NORMAL;
785
786 if (expr && expr->operation != ir_binop_ubo_load) {
787 src_reg op[3];
788 vec4_instruction *inst;
789
790 assert(expr->get_num_operands() <= 3);
791 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
792 expr->operands[i]->accept(this);
793 op[i] = this->result;
794
795 resolve_ud_negate(&op[i]);
796 }
797
798 switch (expr->operation) {
799 case ir_unop_logic_not:
800 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
801 inst->conditional_mod = BRW_CONDITIONAL_Z;
802 break;
803
804 case ir_binop_logic_xor:
805 if (devinfo->gen <= 5) {
806 src_reg temp = src_reg(this, ir->type);
807 emit(XOR(dst_reg(temp), op[0], op[1]));
808 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
809 } else {
810 inst = emit(XOR(dst_null_d(), op[0], op[1]));
811 }
812 inst->conditional_mod = BRW_CONDITIONAL_NZ;
813 break;
814
815 case ir_binop_logic_or:
816 if (devinfo->gen <= 5) {
817 src_reg temp = src_reg(this, ir->type);
818 emit(OR(dst_reg(temp), op[0], op[1]));
819 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
820 } else {
821 inst = emit(OR(dst_null_d(), op[0], op[1]));
822 }
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 break;
825
826 case ir_binop_logic_and:
827 if (devinfo->gen <= 5) {
828 src_reg temp = src_reg(this, ir->type);
829 emit(AND(dst_reg(temp), op[0], op[1]));
830 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
831 } else {
832 inst = emit(AND(dst_null_d(), op[0], op[1]));
833 }
834 inst->conditional_mod = BRW_CONDITIONAL_NZ;
835 break;
836
837 case ir_unop_f2b:
838 if (devinfo->gen >= 6) {
839 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
840 } else {
841 inst = emit(MOV(dst_null_f(), op[0]));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 break;
845
846 case ir_unop_i2b:
847 if (devinfo->gen >= 6) {
848 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
849 } else {
850 inst = emit(MOV(dst_null_d(), op[0]));
851 inst->conditional_mod = BRW_CONDITIONAL_NZ;
852 }
853 break;
854
855 case ir_binop_all_equal:
856 if (devinfo->gen <= 5) {
857 resolve_bool_comparison(expr->operands[0], &op[0]);
858 resolve_bool_comparison(expr->operands[1], &op[1]);
859 }
860 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
861 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
862 break;
863
864 case ir_binop_any_nequal:
865 if (devinfo->gen <= 5) {
866 resolve_bool_comparison(expr->operands[0], &op[0]);
867 resolve_bool_comparison(expr->operands[1], &op[1]);
868 }
869 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
870 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
871 break;
872
873 case ir_unop_any:
874 if (devinfo->gen <= 5) {
875 resolve_bool_comparison(expr->operands[0], &op[0]);
876 }
877 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
878 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
879 break;
880
881 case ir_binop_greater:
882 case ir_binop_gequal:
883 case ir_binop_less:
884 case ir_binop_lequal:
885 case ir_binop_equal:
886 case ir_binop_nequal:
887 if (devinfo->gen <= 5) {
888 resolve_bool_comparison(expr->operands[0], &op[0]);
889 resolve_bool_comparison(expr->operands[1], &op[1]);
890 }
891 emit(CMP(dst_null_d(), op[0], op[1],
892 brw_conditional_for_comparison(expr->operation)));
893 break;
894
895 case ir_triop_csel: {
896 /* Expand the boolean condition into the flag register. */
897 inst = emit(MOV(dst_null_d(), op[0]));
898 inst->conditional_mod = BRW_CONDITIONAL_NZ;
899
900 /* Select which boolean to return. */
901 dst_reg temp(this, expr->operands[1]->type);
902 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
903 inst->predicate = BRW_PREDICATE_NORMAL;
904
905 /* Expand the result to a condition code. */
906 inst = emit(MOV(dst_null_d(), src_reg(temp)));
907 inst->conditional_mod = BRW_CONDITIONAL_NZ;
908 break;
909 }
910
911 default:
912 unreachable("not reached");
913 }
914 return;
915 }
916
917 ir->accept(this);
918
919 resolve_ud_negate(&this->result);
920
921 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
922 inst->conditional_mod = BRW_CONDITIONAL_NZ;
923 }
924
925 /**
926 * Emit a gen6 IF statement with the comparison folded into the IF
927 * instruction.
928 */
929 void
930 vec4_visitor::emit_if_gen6(ir_if *ir)
931 {
932 ir_expression *expr = ir->condition->as_expression();
933
934 if (expr && expr->operation != ir_binop_ubo_load) {
935 src_reg op[3];
936 dst_reg temp;
937
938 assert(expr->get_num_operands() <= 3);
939 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
940 expr->operands[i]->accept(this);
941 op[i] = this->result;
942 }
943
944 switch (expr->operation) {
945 case ir_unop_logic_not:
946 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
947 return;
948
949 case ir_binop_logic_xor:
950 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
951 return;
952
953 case ir_binop_logic_or:
954 temp = dst_reg(this, glsl_type::bool_type);
955 emit(OR(temp, op[0], op[1]));
956 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
957 return;
958
959 case ir_binop_logic_and:
960 temp = dst_reg(this, glsl_type::bool_type);
961 emit(AND(temp, op[0], op[1]));
962 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
963 return;
964
965 case ir_unop_f2b:
966 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_i2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_binop_greater:
974 case ir_binop_gequal:
975 case ir_binop_less:
976 case ir_binop_lequal:
977 case ir_binop_equal:
978 case ir_binop_nequal:
979 emit(IF(op[0], op[1],
980 brw_conditional_for_comparison(expr->operation)));
981 return;
982
983 case ir_binop_all_equal:
984 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
985 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
986 return;
987
988 case ir_binop_any_nequal:
989 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
990 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
991 return;
992
993 case ir_unop_any:
994 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
995 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
996 return;
997
998 case ir_triop_csel: {
999 /* Expand the boolean condition into the flag register. */
1000 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1001 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1002
1003 /* Select which boolean to return. */
1004 dst_reg temp(this, expr->operands[1]->type);
1005 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1006 inst->predicate = BRW_PREDICATE_NORMAL;
1007
1008 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1009 return;
1010 }
1011
1012 default:
1013 unreachable("not reached");
1014 }
1015 return;
1016 }
1017
1018 ir->condition->accept(this);
1019
1020 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1021 }
1022
1023 void
1024 vec4_visitor::visit(ir_variable *ir)
1025 {
1026 dst_reg *reg = NULL;
1027
1028 if (variable_storage(ir))
1029 return;
1030
1031 switch (ir->data.mode) {
1032 case ir_var_shader_in:
1033 assert(ir->data.location != -1);
1034 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1035 break;
1036
1037 case ir_var_shader_out:
1038 assert(ir->data.location != -1);
1039 reg = new(mem_ctx) dst_reg(this, ir->type);
1040
1041 for (int i = 0; i < type_size(ir->type); i++) {
1042 output_reg[ir->data.location + i] = *reg;
1043 output_reg[ir->data.location + i].reg_offset = i;
1044 output_reg[ir->data.location + i].type =
1045 brw_type_for_base_type(ir->type->get_scalar_type());
1046 output_reg_annotation[ir->data.location + i] = ir->name;
1047 }
1048 break;
1049
1050 case ir_var_auto:
1051 case ir_var_temporary:
1052 reg = new(mem_ctx) dst_reg(this, ir->type);
1053 break;
1054
1055 case ir_var_uniform:
1056 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1057
1058 /* Thanks to the lower_ubo_reference pass, we will see only
1059 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1060 * variables, so no need for them to be in variable_ht.
1061 *
1062 * Some uniforms, such as samplers and atomic counters, have no actual
1063 * storage, so we should ignore them.
1064 */
1065 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1066 return;
1067
1068 /* Track how big the whole uniform variable is, in case we need to put a
1069 * copy of its data into pull constants for array access.
1070 */
1071 assert(this->uniforms < uniform_array_size);
1072 this->uniform_size[this->uniforms] = type_size(ir->type);
1073
1074 if (!strncmp(ir->name, "gl_", 3)) {
1075 setup_builtin_uniform_values(ir);
1076 } else {
1077 setup_uniform_values(ir);
1078 }
1079 break;
1080
1081 case ir_var_system_value:
1082 reg = make_reg_for_system_value(ir);
1083 break;
1084
1085 default:
1086 unreachable("not reached");
1087 }
1088
1089 reg->type = brw_type_for_base_type(ir->type);
1090 hash_table_insert(this->variable_ht, reg, ir);
1091 }
1092
1093 void
1094 vec4_visitor::visit(ir_loop *ir)
1095 {
1096 /* We don't want debugging output to print the whole body of the
1097 * loop as the annotation.
1098 */
1099 this->base_ir = NULL;
1100
1101 emit(BRW_OPCODE_DO);
1102
1103 visit_instructions(&ir->body_instructions);
1104
1105 emit(BRW_OPCODE_WHILE);
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_loop_jump *ir)
1110 {
1111 switch (ir->mode) {
1112 case ir_loop_jump::jump_break:
1113 emit(BRW_OPCODE_BREAK);
1114 break;
1115 case ir_loop_jump::jump_continue:
1116 emit(BRW_OPCODE_CONTINUE);
1117 break;
1118 }
1119 }
1120
1121
1122 void
1123 vec4_visitor::visit(ir_function_signature *)
1124 {
1125 unreachable("not reached");
1126 }
1127
1128 void
1129 vec4_visitor::visit(ir_function *ir)
1130 {
1131 /* Ignore function bodies other than main() -- we shouldn't see calls to
1132 * them since they should all be inlined.
1133 */
1134 if (strcmp(ir->name, "main") == 0) {
1135 const ir_function_signature *sig;
1136 exec_list empty;
1137
1138 sig = ir->matching_signature(NULL, &empty, false);
1139
1140 assert(sig);
1141
1142 visit_instructions(&sig->body);
1143 }
1144 }
1145
1146 bool
1147 vec4_visitor::try_emit_mad(ir_expression *ir)
1148 {
1149 /* 3-src instructions were introduced in gen6. */
1150 if (devinfo->gen < 6)
1151 return false;
1152
1153 /* MAD can only handle floating-point data. */
1154 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1155 return false;
1156
1157 ir_rvalue *nonmul;
1158 ir_expression *mul;
1159 bool mul_negate, mul_abs;
1160
1161 for (int i = 0; i < 2; i++) {
1162 mul_negate = false;
1163 mul_abs = false;
1164
1165 mul = ir->operands[i]->as_expression();
1166 nonmul = ir->operands[1 - i];
1167
1168 if (mul && mul->operation == ir_unop_abs) {
1169 mul = mul->operands[0]->as_expression();
1170 mul_abs = true;
1171 } else if (mul && mul->operation == ir_unop_neg) {
1172 mul = mul->operands[0]->as_expression();
1173 mul_negate = true;
1174 }
1175
1176 if (mul && mul->operation == ir_binop_mul)
1177 break;
1178 }
1179
1180 if (!mul || mul->operation != ir_binop_mul)
1181 return false;
1182
1183 nonmul->accept(this);
1184 src_reg src0 = fix_3src_operand(this->result);
1185
1186 mul->operands[0]->accept(this);
1187 src_reg src1 = fix_3src_operand(this->result);
1188 src1.negate ^= mul_negate;
1189 src1.abs = mul_abs;
1190 if (mul_abs)
1191 src1.negate = false;
1192
1193 mul->operands[1]->accept(this);
1194 src_reg src2 = fix_3src_operand(this->result);
1195 src2.abs = mul_abs;
1196 if (mul_abs)
1197 src2.negate = false;
1198
1199 this->result = src_reg(this, ir->type);
1200 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1201
1202 return true;
1203 }
1204
1205 bool
1206 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1207 {
1208 /* This optimization relies on CMP setting the destination to 0 when
1209 * false. Early hardware only sets the least significant bit, and
1210 * leaves the other bits undefined. So we can't use it.
1211 */
1212 if (devinfo->gen < 6)
1213 return false;
1214
1215 ir_expression *const cmp = ir->operands[0]->as_expression();
1216
1217 if (cmp == NULL)
1218 return false;
1219
1220 switch (cmp->operation) {
1221 case ir_binop_less:
1222 case ir_binop_greater:
1223 case ir_binop_lequal:
1224 case ir_binop_gequal:
1225 case ir_binop_equal:
1226 case ir_binop_nequal:
1227 break;
1228
1229 default:
1230 return false;
1231 }
1232
1233 cmp->operands[0]->accept(this);
1234 const src_reg cmp_src0 = this->result;
1235
1236 cmp->operands[1]->accept(this);
1237 const src_reg cmp_src1 = this->result;
1238
1239 this->result = src_reg(this, ir->type);
1240
1241 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1242 brw_conditional_for_comparison(cmp->operation)));
1243
1244 /* If the comparison is false, this->result will just happen to be zero.
1245 */
1246 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1247 this->result, src_reg(1.0f));
1248 inst->predicate = BRW_PREDICATE_NORMAL;
1249 inst->predicate_inverse = true;
1250
1251 return true;
1252 }
1253
1254 void
1255 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1256 src_reg src0, src_reg src1)
1257 {
1258 vec4_instruction *inst;
1259
1260 if (devinfo->gen >= 6) {
1261 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1262 inst->conditional_mod = conditionalmod;
1263 } else {
1264 emit(CMP(dst, src0, src1, conditionalmod));
1265
1266 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1267 inst->predicate = BRW_PREDICATE_NORMAL;
1268 }
1269 }
1270
1271 void
1272 vec4_visitor::emit_lrp(const dst_reg &dst,
1273 const src_reg &x, const src_reg &y, const src_reg &a)
1274 {
1275 if (devinfo->gen >= 6) {
1276 /* Note that the instruction's argument order is reversed from GLSL
1277 * and the IR.
1278 */
1279 emit(LRP(dst,
1280 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1281 } else {
1282 /* Earlier generations don't support three source operations, so we
1283 * need to emit x*(1-a) + y*a.
1284 */
1285 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1286 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1287 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1288 y_times_a.writemask = dst.writemask;
1289 one_minus_a.writemask = dst.writemask;
1290 x_times_one_minus_a.writemask = dst.writemask;
1291
1292 emit(MUL(y_times_a, y, a));
1293 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1294 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1295 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1296 }
1297 }
1298
1299 /**
1300 * Emits the instructions needed to perform a pull constant load. before_block
1301 * and before_inst can be NULL, in which case the instruction will be appended
1302 * to the end of the instruction list.
1303 */
1304 void
1305 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1306 src_reg surf_index,
1307 src_reg offset_reg,
1308 bblock_t *before_block,
1309 vec4_instruction *before_inst)
1310 {
1311 assert((before_inst == NULL && before_block == NULL) ||
1312 (before_inst && before_block));
1313
1314 vec4_instruction *pull;
1315
1316 if (devinfo->gen >= 9) {
1317 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1318 src_reg header(this, glsl_type::uvec4_type, 2);
1319
1320 pull = new(mem_ctx)
1321 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1322 dst_reg(header));
1323
1324 if (before_inst)
1325 emit_before(before_block, before_inst, pull);
1326 else
1327 emit(pull);
1328
1329 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1330 offset_reg.type);
1331 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1332
1333 if (before_inst)
1334 emit_before(before_block, before_inst, pull);
1335 else
1336 emit(pull);
1337
1338 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1339 dst,
1340 surf_index,
1341 header);
1342 pull->mlen = 2;
1343 pull->header_size = 1;
1344 } else if (devinfo->gen >= 7) {
1345 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1346
1347 grf_offset.type = offset_reg.type;
1348
1349 pull = MOV(grf_offset, offset_reg);
1350
1351 if (before_inst)
1352 emit_before(before_block, before_inst, pull);
1353 else
1354 emit(pull);
1355
1356 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1357 dst,
1358 surf_index,
1359 src_reg(grf_offset));
1360 pull->mlen = 1;
1361 } else {
1362 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1363 dst,
1364 surf_index,
1365 offset_reg);
1366 pull->base_mrf = 14;
1367 pull->mlen = 1;
1368 }
1369
1370 if (before_inst)
1371 emit_before(before_block, before_inst, pull);
1372 else
1373 emit(pull);
1374 }
1375
1376 void
1377 vec4_visitor::emit_uniformize(const dst_reg &dst, const src_reg &src)
1378 {
1379 const src_reg chan_index(this, glsl_type::uint_type);
1380
1381 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1382 ->force_writemask_all = true;
1383 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1384 ->force_writemask_all = true;
1385 }
1386
1387 void
1388 vec4_visitor::visit(ir_expression *ir)
1389 {
1390 unsigned int operand;
1391 src_reg op[ARRAY_SIZE(ir->operands)];
1392 vec4_instruction *inst;
1393
1394 if (ir->operation == ir_binop_add) {
1395 if (try_emit_mad(ir))
1396 return;
1397 }
1398
1399 if (ir->operation == ir_unop_b2f) {
1400 if (try_emit_b2f_of_compare(ir))
1401 return;
1402 }
1403
1404 /* Storage for our result. Ideally for an assignment we'd be using
1405 * the actual storage for the result here, instead.
1406 */
1407 dst_reg result_dst(this, ir->type);
1408 src_reg result_src(result_dst);
1409
1410 if (ir->operation == ir_triop_csel) {
1411 ir->operands[1]->accept(this);
1412 op[1] = this->result;
1413 ir->operands[2]->accept(this);
1414 op[2] = this->result;
1415
1416 enum brw_predicate predicate;
1417 emit_bool_to_cond_code(ir->operands[0], &predicate);
1418 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1419 inst->predicate = predicate;
1420 this->result = result_src;
1421 return;
1422 }
1423
1424 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1425 this->result.file = BAD_FILE;
1426 ir->operands[operand]->accept(this);
1427 if (this->result.file == BAD_FILE) {
1428 fprintf(stderr, "Failed to get tree for expression operand:\n");
1429 ir->operands[operand]->fprint(stderr);
1430 exit(1);
1431 }
1432 op[operand] = this->result;
1433
1434 /* Matrix expression operands should have been broken down to vector
1435 * operations already.
1436 */
1437 assert(!ir->operands[operand]->type->is_matrix());
1438 }
1439
1440 /* If nothing special happens, this is the result. */
1441 this->result = result_src;
1442
1443 switch (ir->operation) {
1444 case ir_unop_logic_not:
1445 emit(NOT(result_dst, op[0]));
1446 break;
1447 case ir_unop_neg:
1448 op[0].negate = !op[0].negate;
1449 emit(MOV(result_dst, op[0]));
1450 break;
1451 case ir_unop_abs:
1452 op[0].abs = true;
1453 op[0].negate = false;
1454 emit(MOV(result_dst, op[0]));
1455 break;
1456
1457 case ir_unop_sign:
1458 if (ir->type->is_float()) {
1459 /* AND(val, 0x80000000) gives the sign bit.
1460 *
1461 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1462 * zero.
1463 */
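/* e.g. sign(-3.5) keeps sign bit 0x80000000 and ORs in 0x3f800000, giving
 * 0xbf800000 (-1.0f); sign(2.0) gives 0x3f800000 (1.0f); for 0.0 the
 * predicate is false and the result stays 0.
 */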
1464 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1465
1466 op[0].type = BRW_REGISTER_TYPE_UD;
1467 result_dst.type = BRW_REGISTER_TYPE_UD;
1468 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1469
1470 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1471 inst->predicate = BRW_PREDICATE_NORMAL;
1472
1473 this->result.type = BRW_REGISTER_TYPE_F;
1474 } else {
1475 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1476 * -> non-negative val generates 0x00000000.
1477 * Predicated OR sets 1 if val is positive.
1478 */
1479 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1480
1481 emit(ASR(result_dst, op[0], src_reg(31)));
1482
1483 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1484 inst->predicate = BRW_PREDICATE_NORMAL;
1485 }
1486 break;
1487
1488 case ir_unop_rcp:
1489 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1490 break;
1491
1492 case ir_unop_exp2:
1493 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1494 break;
1495 case ir_unop_log2:
1496 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1497 break;
1498 case ir_unop_exp:
1499 case ir_unop_log:
1500 unreachable("not reached: should be handled by ir_explog_to_explog2");
1501 case ir_unop_sin:
1502 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1503 break;
1504 case ir_unop_cos:
1505 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1506 break;
1507
1508 case ir_unop_dFdx:
1509 case ir_unop_dFdx_coarse:
1510 case ir_unop_dFdx_fine:
1511 case ir_unop_dFdy:
1512 case ir_unop_dFdy_coarse:
1513 case ir_unop_dFdy_fine:
1514 unreachable("derivatives not valid in vertex shader");
1515
1516 case ir_unop_bitfield_reverse:
1517 emit(BFREV(result_dst, op[0]));
1518 break;
1519 case ir_unop_bit_count:
1520 emit(CBIT(result_dst, op[0]));
1521 break;
1522 case ir_unop_find_msb: {
1523 src_reg temp = src_reg(this, glsl_type::uint_type);
1524
1525 inst = emit(FBH(dst_reg(temp), op[0]));
1526 inst->dst.writemask = WRITEMASK_XYZW;
1527
1528 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1529 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1530 * subtract the result from 31 to convert the MSB count into an LSB count.
1531 */
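/* e.g. for an input of 0x00000100, FBH returns 23 (counted from the MSB
 * side) and the predicated 31 - 23 below yields findMSB()'s answer of 8;
 * an FBH result of 0xFFFFFFFF (-1, no bits found) is left untouched.
 */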
1532
1533 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1534 temp.swizzle = BRW_SWIZZLE_NOOP;
1535 emit(MOV(result_dst, temp));
1536
1537 src_reg src_tmp = src_reg(result_dst);
1538 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1539
1540 src_tmp.negate = true;
1541 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1542 inst->predicate = BRW_PREDICATE_NORMAL;
1543 break;
1544 }
1545 case ir_unop_find_lsb:
1546 emit(FBL(result_dst, op[0]));
1547 break;
1548 case ir_unop_saturate:
1549 inst = emit(MOV(result_dst, op[0]));
1550 inst->saturate = true;
1551 break;
1552
1553 case ir_unop_noise:
1554 unreachable("not reached: should be handled by lower_noise");
1555
1556 case ir_binop_add:
1557 emit(ADD(result_dst, op[0], op[1]));
1558 break;
1559 case ir_binop_sub:
1560 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1561
1562 case ir_binop_mul:
1563 if (devinfo->gen < 8 && ir->type->is_integer()) {
1564 /* For integer multiplication, the MUL uses the low 16 bits of one of
1565 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1566 * then adds in the contribution of the upper 16 bits of that
1567 * operand. If we can determine that one of the args is in the low
1568 * 16 bits, though, we can just emit a single MUL.
1569 */
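/* Note (illustrative): in the general non-constant case below, MACH leaves
 * the low 32 bits of the full 32x32 product in the accumulator, so the
 * result is read back from acc with a MOV; MACH's own destination receives
 * the high 32 bits, which is what ir_binop_imul_high uses instead.
 */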
1570 if (ir->operands[0]->is_uint16_constant()) {
1571 if (devinfo->gen < 7)
1572 emit(MUL(result_dst, op[0], op[1]));
1573 else
1574 emit(MUL(result_dst, op[1], op[0]));
1575 } else if (ir->operands[1]->is_uint16_constant()) {
1576 if (devinfo->gen < 7)
1577 emit(MUL(result_dst, op[1], op[0]));
1578 else
1579 emit(MUL(result_dst, op[0], op[1]));
1580 } else {
1581 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1582
1583 emit(MUL(acc, op[0], op[1]));
1584 emit(MACH(dst_null_d(), op[0], op[1]));
1585 emit(MOV(result_dst, src_reg(acc)));
1586 }
1587 } else {
1588 emit(MUL(result_dst, op[0], op[1]));
1589 }
1590 break;
1591 case ir_binop_imul_high: {
1592 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1593
1594 emit(MUL(acc, op[0], op[1]));
1595 emit(MACH(result_dst, op[0], op[1]));
1596 break;
1597 }
1598 case ir_binop_div:
1599 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1600 assert(ir->type->is_integer());
1601 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1602 break;
1603 case ir_binop_carry: {
1604 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1605
1606 emit(ADDC(dst_null_ud(), op[0], op[1]));
1607 emit(MOV(result_dst, src_reg(acc)));
1608 break;
1609 }
1610 case ir_binop_borrow: {
1611 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1612
1613 emit(SUBB(dst_null_ud(), op[0], op[1]));
1614 emit(MOV(result_dst, src_reg(acc)));
1615 break;
1616 }
1617 case ir_binop_mod:
1618 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1619 assert(ir->type->is_integer());
1620 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1621 break;
1622
1623 case ir_binop_less:
1624 case ir_binop_greater:
1625 case ir_binop_lequal:
1626 case ir_binop_gequal:
1627 case ir_binop_equal:
1628 case ir_binop_nequal: {
1629 if (devinfo->gen <= 5) {
1630 resolve_bool_comparison(ir->operands[0], &op[0]);
1631 resolve_bool_comparison(ir->operands[1], &op[1]);
1632 }
1633 emit(CMP(result_dst, op[0], op[1],
1634 brw_conditional_for_comparison(ir->operation)));
1635 break;
1636 }
1637
1638 case ir_binop_all_equal:
1639 if (devinfo->gen <= 5) {
1640 resolve_bool_comparison(ir->operands[0], &op[0]);
1641 resolve_bool_comparison(ir->operands[1], &op[1]);
1642 }
1643
1644 /* "==" operator producing a scalar boolean. */
1645 if (ir->operands[0]->type->is_vector() ||
1646 ir->operands[1]->type->is_vector()) {
1647 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1648 emit(MOV(result_dst, src_reg(0)));
1649 inst = emit(MOV(result_dst, src_reg(~0)));
1650 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1651 } else {
1652 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1653 }
1654 break;
1655 case ir_binop_any_nequal:
1656 if (devinfo->gen <= 5) {
1657 resolve_bool_comparison(ir->operands[0], &op[0]);
1658 resolve_bool_comparison(ir->operands[1], &op[1]);
1659 }
1660
1661 /* "!=" operator producing a scalar boolean. */
1662 if (ir->operands[0]->type->is_vector() ||
1663 ir->operands[1]->type->is_vector()) {
1664 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1665
1666 emit(MOV(result_dst, src_reg(0)));
1667 inst = emit(MOV(result_dst, src_reg(~0)));
1668 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1669 } else {
1670 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1671 }
1672 break;
1673
1674 case ir_unop_any:
1675 if (devinfo->gen <= 5) {
1676 resolve_bool_comparison(ir->operands[0], &op[0]);
1677 }
1678 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1679 emit(MOV(result_dst, src_reg(0)));
1680
1681 inst = emit(MOV(result_dst, src_reg(~0)));
1682 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1683 break;
1684
1685 case ir_binop_logic_xor:
1686 emit(XOR(result_dst, op[0], op[1]));
1687 break;
1688
1689 case ir_binop_logic_or:
1690 emit(OR(result_dst, op[0], op[1]));
1691 break;
1692
1693 case ir_binop_logic_and:
1694 emit(AND(result_dst, op[0], op[1]));
1695 break;
1696
1697 case ir_binop_dot:
1698 assert(ir->operands[0]->type->is_vector());
1699 assert(ir->operands[0]->type == ir->operands[1]->type);
1700 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1701 break;
1702
1703 case ir_unop_sqrt:
1704 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1705 break;
1706 case ir_unop_rsq:
1707 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1708 break;
1709
1710 case ir_unop_bitcast_i2f:
1711 case ir_unop_bitcast_u2f:
1712 this->result = op[0];
1713 this->result.type = BRW_REGISTER_TYPE_F;
1714 break;
1715
1716 case ir_unop_bitcast_f2i:
1717 this->result = op[0];
1718 this->result.type = BRW_REGISTER_TYPE_D;
1719 break;
1720
1721 case ir_unop_bitcast_f2u:
1722 this->result = op[0];
1723 this->result.type = BRW_REGISTER_TYPE_UD;
1724 break;
1725
1726 case ir_unop_i2f:
1727 case ir_unop_i2u:
1728 case ir_unop_u2i:
1729 case ir_unop_u2f:
1730 case ir_unop_f2i:
1731 case ir_unop_f2u:
1732 emit(MOV(result_dst, op[0]));
1733 break;
1734 case ir_unop_b2i:
1735 emit(AND(result_dst, op[0], src_reg(1)));
1736 break;
1737 case ir_unop_b2f:
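/* Booleans reaching this point are 0 or ~0 (gen <= 5 needs the
 * resolve_bool_comparison below first), so ANDing with 0x3f800000, the bit
 * pattern of 1.0f, yields 0.0f or 1.0f directly.
 */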
1738 if (devinfo->gen <= 5) {
1739 resolve_bool_comparison(ir->operands[0], &op[0]);
1740 }
1741 op[0].type = BRW_REGISTER_TYPE_D;
1742 result_dst.type = BRW_REGISTER_TYPE_D;
1743 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1744 result_dst.type = BRW_REGISTER_TYPE_F;
1745 break;
1746 case ir_unop_f2b:
1747 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1748 break;
1749 case ir_unop_i2b:
1750 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1751 break;
1752
1753 case ir_unop_trunc:
1754 emit(RNDZ(result_dst, op[0]));
1755 break;
1756 case ir_unop_ceil: {
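/* ceil(x) is computed as -RNDD(-x), i.e. -floor(-x). */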
1757 src_reg tmp = src_reg(this, ir->type);
1758 op[0].negate = !op[0].negate;
1759 emit(RNDD(dst_reg(tmp), op[0]));
1760 tmp.negate = true;
1761 emit(MOV(result_dst, tmp));
1762 }
1763 break;
1764 case ir_unop_floor:
1765 inst = emit(RNDD(result_dst, op[0]));
1766 break;
1767 case ir_unop_fract:
1768 inst = emit(FRC(result_dst, op[0]));
1769 break;
1770 case ir_unop_round_even:
1771 emit(RNDE(result_dst, op[0]));
1772 break;
1773
1774 case ir_binop_min:
1775 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1776 break;
1777 case ir_binop_max:
1778 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1779 break;
1780
1781 case ir_binop_pow:
1782 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1783 break;
1784
1785 case ir_unop_bit_not:
1786 inst = emit(NOT(result_dst, op[0]));
1787 break;
1788 case ir_binop_bit_and:
1789 inst = emit(AND(result_dst, op[0], op[1]));
1790 break;
1791 case ir_binop_bit_xor:
1792 inst = emit(XOR(result_dst, op[0], op[1]));
1793 break;
1794 case ir_binop_bit_or:
1795 inst = emit(OR(result_dst, op[0], op[1]));
1796 break;
1797
1798 case ir_binop_lshift:
1799 inst = emit(SHL(result_dst, op[0], op[1]));
1800 break;
1801
1802 case ir_binop_rshift:
1803 if (ir->type->base_type == GLSL_TYPE_INT)
1804 inst = emit(ASR(result_dst, op[0], op[1]));
1805 else
1806 inst = emit(SHR(result_dst, op[0], op[1]));
1807 break;
1808
1809 case ir_binop_bfm:
1810 emit(BFI1(result_dst, op[0], op[1]));
1811 break;
1812
1813 case ir_binop_ubo_load: {
1814 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1815 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1816 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1817 src_reg offset;
1818
1819 /* Now, load the vector from that offset. */
1820 assert(ir->type->is_vector() || ir->type->is_scalar());
1821
1822 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1823 packed_consts.type = result.type;
1824 src_reg surf_index;
1825
1826 if (const_uniform_block) {
1827 /* The block index is a constant, so just emit the binding table entry
1828 * as an immediate.
1829 */
1830 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1831 const_uniform_block->value.u[0]);
1832 } else {
1833 /* The block index is not a constant. Evaluate the index expression
1834 * per-channel and add the base UBO index; we have to select a value
1835 * from any live channel.
1836 */
1837 surf_index = src_reg(this, glsl_type::uint_type);
1838 emit(ADD(dst_reg(surf_index), op[0],
1839 src_reg(prog_data->base.binding_table.ubo_start)));
1840 emit_uniformize(dst_reg(surf_index), surf_index);
1841
1842 /* Assume this may touch any UBO. It would be nice to provide
1843 * a tighter bound, but the array information is already lowered away.
1844 */
1845 brw_mark_surface_used(&prog_data->base,
1846 prog_data->base.binding_table.ubo_start +
1847 shader_prog->NumUniformBlocks - 1);
1848 }
1849
1850 if (const_offset_ir) {
1851 if (devinfo->gen >= 8) {
1852 /* Store the offset in a GRF so we can send-from-GRF. */
1853 offset = src_reg(this, glsl_type::int_type);
1854 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1855 } else {
1856 /* Immediates are fine on older generations since they'll be moved
1857 * to a (potentially fake) MRF at the generator level.
1858 */
1859 offset = src_reg(const_offset / 16);
1860 }
1861 } else {
1862 offset = src_reg(this, glsl_type::uint_type);
1863 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1864 }
1865
1866 emit_pull_constant_load_reg(dst_reg(packed_consts),
1867 surf_index,
1868 offset,
1869 NULL, NULL /* before_block/inst */);
1870
1871 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1872 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1873 const_offset % 16 / 4,
1874 const_offset % 16 / 4,
1875 const_offset % 16 / 4);
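/* For example, a constant byte offset of 36 results in a pull load of the
 * vec4 at offset 36 / 16 = 2 above, with the swizzle shift of
 * (36 % 16) / 4 = 1 just applied selecting its .y component.
 */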
1876
1877 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1878 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1879 emit(CMP(result_dst, packed_consts, src_reg(0u),
1880 BRW_CONDITIONAL_NZ));
1881 } else {
1882 emit(MOV(result_dst, packed_consts));
1883 }
1884 break;
1885 }
1886
1887 case ir_binop_vector_extract:
1888 unreachable("should have been lowered by vec_index_to_cond_assign");
1889
1890 case ir_triop_fma:
1891 op[0] = fix_3src_operand(op[0]);
1892 op[1] = fix_3src_operand(op[1]);
1893 op[2] = fix_3src_operand(op[2]);
1894 /* Note that the instruction's argument order is reversed from GLSL
1895 * and the IR.
1896 */
1897 emit(MAD(result_dst, op[2], op[1], op[0]));
1898 break;
1899
1900 case ir_triop_lrp:
1901 emit_lrp(result_dst, op[0], op[1], op[2]);
1902 break;
1903
1904 case ir_triop_csel:
1905 unreachable("already handled above");
1906 break;
1907
1908 case ir_triop_bfi:
1909 op[0] = fix_3src_operand(op[0]);
1910 op[1] = fix_3src_operand(op[1]);
1911 op[2] = fix_3src_operand(op[2]);
1912 emit(BFI2(result_dst, op[0], op[1], op[2]));
1913 break;
1914
1915 case ir_triop_bitfield_extract:
1916 op[0] = fix_3src_operand(op[0]);
1917 op[1] = fix_3src_operand(op[1]);
1918 op[2] = fix_3src_operand(op[2]);
1919 /* Note that the instruction's argument order is reversed from GLSL
1920 * and the IR.
1921 */
1922 emit(BFE(result_dst, op[2], op[1], op[0]));
1923 break;
1924
1925 case ir_triop_vector_insert:
1926 unreachable("should have been lowered by lower_vector_insert");
1927
1928 case ir_quadop_bitfield_insert:
1929 unreachable("not reached: should be handled by "
1930 "bitfield_insert_to_bfm_bfi\n");
1931
1932 case ir_quadop_vector:
1933 unreachable("not reached: should be handled by lower_quadop_vector");
1934
1935 case ir_unop_pack_half_2x16:
1936 emit_pack_half_2x16(result_dst, op[0]);
1937 break;
1938 case ir_unop_unpack_half_2x16:
1939 emit_unpack_half_2x16(result_dst, op[0]);
1940 break;
1941 case ir_unop_unpack_unorm_4x8:
1942 emit_unpack_unorm_4x8(result_dst, op[0]);
1943 break;
1944 case ir_unop_unpack_snorm_4x8:
1945 emit_unpack_snorm_4x8(result_dst, op[0]);
1946 break;
1947 case ir_unop_pack_unorm_4x8:
1948 emit_pack_unorm_4x8(result_dst, op[0]);
1949 break;
1950 case ir_unop_pack_snorm_4x8:
1951 emit_pack_snorm_4x8(result_dst, op[0]);
1952 break;
1953 case ir_unop_pack_snorm_2x16:
1954 case ir_unop_pack_unorm_2x16:
1955 case ir_unop_unpack_snorm_2x16:
1956 case ir_unop_unpack_unorm_2x16:
1957 unreachable("not reached: should be handled by lower_packing_builtins");
1958 case ir_unop_unpack_half_2x16_split_x:
1959 case ir_unop_unpack_half_2x16_split_y:
1960 case ir_binop_pack_half_2x16_split:
1961 case ir_unop_interpolate_at_centroid:
1962 case ir_binop_interpolate_at_sample:
1963 case ir_binop_interpolate_at_offset:
1964 unreachable("not reached: should not occur in vertex shader");
1965 case ir_binop_ldexp:
1966 unreachable("not reached: should be handled by ldexp_to_arith()");
1967 case ir_unop_d2f:
1968 case ir_unop_f2d:
1969 case ir_unop_d2i:
1970 case ir_unop_i2d:
1971 case ir_unop_d2u:
1972 case ir_unop_u2d:
1973 case ir_unop_d2b:
1974 case ir_unop_pack_double_2x32:
1975 case ir_unop_unpack_double_2x32:
1976 case ir_unop_frexp_sig:
1977 case ir_unop_frexp_exp:
1978 unreachable("fp64 todo");
1979 }
1980 }
1981
1982
1983 void
1984 vec4_visitor::visit(ir_swizzle *ir)
1985 {
1986 /* Note that this is only swizzles in expressions, not those on the left
1987 * hand side of an assignment, which do write masking. See ir_assignment
1988 * for that.
1989 */
1990 const unsigned swz = brw_compose_swizzle(
1991 brw_swizzle_for_size(ir->type->vector_elements),
1992 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1993
1994 ir->val->accept(this);
1995 this->result = swizzle(this->result, swz);
1996 }
1997
1998 void
1999 vec4_visitor::visit(ir_dereference_variable *ir)
2000 {
2001 const struct glsl_type *type = ir->type;
2002 dst_reg *reg = variable_storage(ir->var);
2003
2004 if (!reg) {
2005 fail("Failed to find variable storage for %s\n", ir->var->name);
2006 this->result = src_reg(brw_null_reg());
2007 return;
2008 }
2009
2010 this->result = src_reg(*reg);
2011
2012 /* System values get their swizzle from the dst_reg writemask */
2013 if (ir->var->data.mode == ir_var_system_value)
2014 return;
2015
2016 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2017 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2018 }
2019
2020
2021 int
2022 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2023 {
2024 /* Under normal circumstances array elements are stored consecutively, so
2025 * the stride is equal to the size of the array element.
2026 */
2027 return type_size(ir->type);
2028 }
2029
2030
2031 void
2032 vec4_visitor::visit(ir_dereference_array *ir)
2033 {
2034 ir_constant *constant_index;
2035 src_reg src;
2036 int array_stride = compute_array_stride(ir);
2037
2038 constant_index = ir->array_index->constant_expression_value();
2039
2040 ir->array->accept(this);
2041 src = this->result;
2042
2043 if (constant_index) {
2044 src.reg_offset += constant_index->value.i[0] * array_stride;
2045 } else {
2046 /* Variable index array dereference. The result is the register holding
2047 * the base of the array, plus a dynamically computed index (reladdr)
2048 * that offsets the register index at which the element lives.
2049 */
2050 ir->array_index->accept(this);
2051
2052 src_reg index_reg;
2053
2054 if (array_stride == 1) {
2055 index_reg = this->result;
2056 } else {
2057 index_reg = src_reg(this, glsl_type::int_type);
2058
2059 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2060 }
2061
2062 if (src.reladdr) {
2063 src_reg temp = src_reg(this, glsl_type::int_type);
2064
2065 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2066
2067 index_reg = temp;
2068 }
2069
2070 src.reladdr = ralloc(mem_ctx, src_reg);
2071 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2072 }
2073
2074 /* If the type is smaller than a vec4, replicate the last channel out. */
2075 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2076 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2077 else
2078 src.swizzle = BRW_SWIZZLE_NOOP;
2079 src.type = brw_type_for_base_type(ir->type);
2080
2081 this->result = src;
2082 }
2083
2084 void
2085 vec4_visitor::visit(ir_dereference_record *ir)
2086 {
2087 unsigned int i;
2088 const glsl_type *struct_type = ir->record->type;
2089 int offset = 0;
2090
2091 ir->record->accept(this);
2092
2093 for (i = 0; i < struct_type->length; i++) {
2094 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2095 break;
2096 offset += type_size(struct_type->fields.structure[i].type);
2097 }
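/* offset now counts whole vec4 registers up to the requested field; e.g.
 * for struct { vec4 a; vec3 b; float c; }, accessing "c" adds 2, since
 * each non-matrix field occupies a full vec4 register in this backend.
 */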
2098
2099 /* If the type is smaller than a vec4, replicate the last channel out. */
2100 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2101 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2102 else
2103 this->result.swizzle = BRW_SWIZZLE_NOOP;
2104 this->result.type = brw_type_for_base_type(ir->type);
2105
2106 this->result.reg_offset += offset;
2107 }
2108
2109 /**
2110 * We want to be careful in assignment setup to hit the actual storage
2111 * instead of potentially using a temporary like we might with the
2112 * ir_dereference handler.
2113 */
2114 static dst_reg
2115 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2116 {
2117 /* The LHS must be a dereference. If the LHS is a variable indexed array
2118 * access of a vector, it must be separated into a series of conditional moves
2119 * before reaching this point (see ir_vec_index_to_cond_assign).
2120 */
2121 assert(ir->as_dereference());
2122 ir_dereference_array *deref_array = ir->as_dereference_array();
2123 if (deref_array) {
2124 assert(!deref_array->array->type->is_vector());
2125 }
2126
2127 /* Use the rvalue deref handler for the most part. We'll ignore
2128 * swizzles in it and write swizzles using writemask, though.
2129 */
2130 ir->accept(v);
2131 return dst_reg(v->result);
2132 }
2133
2134 void
2135 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2136 const struct glsl_type *type,
2137 enum brw_predicate predicate)
2138 {
2139 if (type->base_type == GLSL_TYPE_STRUCT) {
2140 for (unsigned int i = 0; i < type->length; i++) {
2141 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2142 }
2143 return;
2144 }
2145
2146 if (type->is_array()) {
2147 for (unsigned int i = 0; i < type->length; i++) {
2148 emit_block_move(dst, src, type->fields.array, predicate);
2149 }
2150 return;
2151 }
2152
2153 if (type->is_matrix()) {
2154 const struct glsl_type *vec_type;
2155
2156 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2157 type->vector_elements, 1);
2158
2159 for (int i = 0; i < type->matrix_columns; i++) {
2160 emit_block_move(dst, src, vec_type, predicate);
2161 }
2162 return;
2163 }
2164
2165 assert(type->is_scalar() || type->is_vector());
2166
2167 dst->type = brw_type_for_base_type(type);
2168 src->type = dst->type;
2169
2170 dst->writemask = (1 << type->vector_elements) - 1;
2171
2172 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2173
2174 vec4_instruction *inst = emit(MOV(*dst, *src));
2175 inst->predicate = predicate;
2176
2177 dst->reg_offset++;
2178 src->reg_offset++;
2179 }
2180
2181
2182 /* If the RHS processing resulted in an instruction generating a
2183 * temporary value, and it would be easy to rewrite the instruction to
2184 * generate its result right into the LHS instead, do so. This ends
2185 * up reliably removing instructions where it can be tricky to do so
2186 * later without real UD chain information.
2187 */
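/* For example, if the RHS produced "ADD tmp, a, b", the ADD is rewritten
 * to write the assignment's destination directly, and the trailing MOV
 * that visit(ir_assignment) would otherwise emit is skipped.
 */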
2188 bool
2189 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2190 dst_reg dst,
2191 src_reg src,
2192 vec4_instruction *pre_rhs_inst,
2193 vec4_instruction *last_rhs_inst)
2194 {
2195 /* This could be supported, but it would take more smarts. */
2196 if (ir->condition)
2197 return false;
2198
2199 if (pre_rhs_inst == last_rhs_inst)
2200 return false; /* No instructions generated to work with. */
2201
2202 /* Make sure the last instruction generated our source reg. */
2203 if (src.file != GRF ||
2204 src.file != last_rhs_inst->dst.file ||
2205 src.reg != last_rhs_inst->dst.reg ||
2206 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2207 src.reladdr ||
2208 src.abs ||
2209 src.negate ||
2210 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2211 return false;
2212
2213 /* Check that the last instruction fully initialized the channels
2214 * we want to use, in the order we want to use them. We could
2215 * potentially reswizzle the operands of many instructions so that
2216 * we could handle out of order channels, but don't yet.
2217 */
2218
2219 for (unsigned i = 0; i < 4; i++) {
2220 if (dst.writemask & (1 << i)) {
2221 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2222 return false;
2223
2224 if (BRW_GET_SWZ(src.swizzle, i) != i)
2225 return false;
2226 }
2227 }
2228
2229 /* Success! Rewrite the instruction. */
2230 last_rhs_inst->dst.file = dst.file;
2231 last_rhs_inst->dst.reg = dst.reg;
2232 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2233 last_rhs_inst->dst.reladdr = dst.reladdr;
2234 last_rhs_inst->dst.writemask &= dst.writemask;
2235
2236 return true;
2237 }
2238
2239 void
2240 vec4_visitor::visit(ir_assignment *ir)
2241 {
2242 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2243 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2244
2245 if (!ir->lhs->type->is_scalar() &&
2246 !ir->lhs->type->is_vector()) {
2247 ir->rhs->accept(this);
2248 src_reg src = this->result;
2249
2250 if (ir->condition) {
2251 emit_bool_to_cond_code(ir->condition, &predicate);
2252 }
2253
2254 /* emit_block_move doesn't account for swizzles in the source register.
2255 * This should be ok, since the source register is a structure or an
2256 * array, and those can't be swizzled. But double-check to be sure.
2257 */
2258 assert(src.swizzle ==
2259 (ir->rhs->type->is_matrix()
2260 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2261 : BRW_SWIZZLE_NOOP));
2262
2263 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2264 return;
2265 }
2266
2267 /* Now we're down to just a scalar/vector with writemasks. */
2268 int i;
2269
2270 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2271 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2272
2273 ir->rhs->accept(this);
2274
2275 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2276
2277 int swizzles[4];
2278 int src_chan = 0;
2279
2280 assert(ir->lhs->type->is_vector() ||
2281 ir->lhs->type->is_scalar());
2282 dst.writemask = ir->write_mask;
2283
2284 /* Swizzle a small RHS vector into the channels being written.
2285 *
2286 * glsl ir treats write_mask as dictating how many channels are
2287 * present on the RHS, while in our instructions we need to make
2288 * those channels appear in the slots of the vec4 they're written to.
2289 */
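/* e.g. a write mask of .xz yields swizzles {0, 0, 1, 0}: RHS component 0
 * feeds channel x and component 1 feeds channel z; the remaining channels
 * are masked off anyway.
 */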
2290 for (int i = 0; i < 4; i++)
2291 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2292
2293 src_reg src = swizzle(this->result,
2294 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2295 swizzles[2], swizzles[3]));
2296
2297 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2298 return;
2299 }
2300
2301 if (ir->condition) {
2302 emit_bool_to_cond_code(ir->condition, &predicate);
2303 }
2304
2305 for (i = 0; i < type_size(ir->lhs->type); i++) {
2306 vec4_instruction *inst = emit(MOV(dst, src));
2307 inst->predicate = predicate;
2308
2309 dst.reg_offset++;
2310 src.reg_offset++;
2311 }
2312 }
2313
2314 void
2315 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2316 {
2317 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2318 foreach_in_list(ir_constant, field_value, &ir->components) {
2319 emit_constant_values(dst, field_value);
2320 }
2321 return;
2322 }
2323
2324 if (ir->type->is_array()) {
2325 for (unsigned int i = 0; i < ir->type->length; i++) {
2326 emit_constant_values(dst, ir->array_elements[i]);
2327 }
2328 return;
2329 }
2330
2331 if (ir->type->is_matrix()) {
2332 for (int i = 0; i < ir->type->matrix_columns; i++) {
2333 float *vec = &ir->value.f[i * ir->type->vector_elements];
2334
2335 for (int j = 0; j < ir->type->vector_elements; j++) {
2336 dst->writemask = 1 << j;
2337 dst->type = BRW_REGISTER_TYPE_F;
2338
2339 emit(MOV(*dst, src_reg(vec[j])));
2340 }
2341 dst->reg_offset++;
2342 }
2343 return;
2344 }
2345
2346 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2347
2348 for (int i = 0; i < ir->type->vector_elements; i++) {
2349 if (!(remaining_writemask & (1 << i)))
2350 continue;
2351
2352 dst->writemask = 1 << i;
2353 dst->type = brw_type_for_base_type(ir->type);
2354
2355 /* Find other components that match the one we're about to
2356 * write. Emits fewer instructions for things like vec4(0.5,
2357 * 1.5, 1.5, 1.5).
2358 */
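/* For that vec4(0.5, 1.5, 1.5, 1.5) example, this loop widens the
 * writemask so only two MOVs are emitted: one for .x and one for .yzw.
 */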
2359 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2360 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2361 if (ir->value.b[i] == ir->value.b[j])
2362 dst->writemask |= (1 << j);
2363 } else {
2364 /* u, i, and f storage all line up, so no need for a
2365 * switch case for comparing each type.
2366 */
2367 if (ir->value.u[i] == ir->value.u[j])
2368 dst->writemask |= (1 << j);
2369 }
2370 }
2371
2372 switch (ir->type->base_type) {
2373 case GLSL_TYPE_FLOAT:
2374 emit(MOV(*dst, src_reg(ir->value.f[i])));
2375 break;
2376 case GLSL_TYPE_INT:
2377 emit(MOV(*dst, src_reg(ir->value.i[i])));
2378 break;
2379 case GLSL_TYPE_UINT:
2380 emit(MOV(*dst, src_reg(ir->value.u[i])));
2381 break;
2382 case GLSL_TYPE_BOOL:
2383 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2384 break;
2385 default:
2386 unreachable("Non-float/uint/int/bool constant");
2387 }
2388
2389 remaining_writemask &= ~dst->writemask;
2390 }
2391 dst->reg_offset++;
2392 }
2393
2394 void
2395 vec4_visitor::visit(ir_constant *ir)
2396 {
2397 dst_reg dst = dst_reg(this, ir->type);
2398 this->result = src_reg(dst);
2399
2400 emit_constant_values(&dst, ir);
2401 }
2402
2403 void
2404 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2405 {
2406 ir_dereference *deref = static_cast<ir_dereference *>(
2407 ir->actual_parameters.get_head());
2408 ir_variable *location = deref->variable_referenced();
2409 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2410 location->data.binding);
2411
2412 /* Calculate the surface offset */
2413 src_reg offset(this, glsl_type::uint_type);
2414 ir_dereference_array *deref_array = deref->as_dereference_array();
2415 if (deref_array) {
2416 deref_array->array_index->accept(this);
2417
2418 src_reg tmp(this, glsl_type::uint_type);
2419 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2420 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2421 } else {
2422 offset = location->data.atomic.offset;
2423 }
2424
2425 /* Emit the appropriate machine instruction */
2426 const char *callee = ir->callee->function_name();
2427 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2428
2429 if (!strcmp("__intrinsic_atomic_read", callee)) {
2430 emit_untyped_surface_read(surf_index, dst, offset);
2431
2432 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2433 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2434 src_reg(), src_reg());
2435
2436 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2437 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2438 src_reg(), src_reg());
2439 }
2440 }
2441
2442 void
2443 vec4_visitor::visit(ir_call *ir)
2444 {
2445 const char *callee = ir->callee->function_name();
2446
2447 if (!strcmp("__intrinsic_atomic_read", callee) ||
2448 !strcmp("__intrinsic_atomic_increment", callee) ||
2449 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2450 visit_atomic_counter_intrinsic(ir);
2451 } else {
2452 unreachable("Unsupported intrinsic.");
2453 }
2454 }
2455
2456 src_reg
2457 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2458 {
2459 vec4_instruction *inst =
2460 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2461 dst_reg(this, glsl_type::uvec4_type));
2462 inst->base_mrf = 2;
2463 inst->src[1] = sampler;
2464
2465 int param_base;
2466
2467 if (devinfo->gen >= 9) {
2468 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2469 vec4_instruction *header_inst = new(mem_ctx)
2470 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2471 dst_reg(MRF, inst->base_mrf));
2472
2473 emit(header_inst);
2474
2475 inst->mlen = 2;
2476 inst->header_size = 1;
2477 param_base = inst->base_mrf + 1;
2478 } else {
2479 inst->mlen = 1;
2480 param_base = inst->base_mrf;
2481 }
2482
2483 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2484 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2485 int zero_mask = 0xf & ~coord_mask;
2486
2487 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2488 coordinate));
2489
2490 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2491 src_reg(0)));
2492
2493 emit(inst);
2494 return src_reg(inst->dst);
2495 }
2496
2497 static bool
2498 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2499 {
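   /* Sampler indices that don't fit in the message's 4-bit sampler field
    * need the "high sampler" path (and hence a message header, see the
    * header_size computation in visit(ir_texture)). Pre-Haswell hardware
    * never takes this path.
    */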
2500 if (devinfo->gen < 8 && !devinfo->is_haswell)
2501 return false;
2502
2503 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2504 }
2505
2506 void
2507 vec4_visitor::visit(ir_texture *ir)
2508 {
2509 uint32_t sampler =
2510 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2511
2512 ir_rvalue *nonconst_sampler_index =
2513 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2514
2515 /* Handle non-constant sampler array indexing */
2516 src_reg sampler_reg;
2517 if (nonconst_sampler_index) {
2518 /* The highest sampler which may be used by this operation is
2519 * the last element of the array. Mark it here, because the generator
2520 * doesn't have enough information to determine the bound.
2521 */
2522 uint32_t array_size = ir->sampler->as_dereference_array()
2523 ->array->type->array_size();
2524
2525 uint32_t max_used = sampler + array_size - 1;
2526 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2527 max_used += prog_data->base.binding_table.gather_texture_start;
2528 } else {
2529 max_used += prog_data->base.binding_table.texture_start;
2530 }
2531
2532 brw_mark_surface_used(&prog_data->base, max_used);
2533
2534 /* Emit code to evaluate the actual indexing expression */
2535 nonconst_sampler_index->accept(this);
2536 dst_reg temp(this, glsl_type::uint_type);
2537 emit(ADD(temp, this->result, src_reg(sampler)));
2538 emit_uniformize(temp, src_reg(temp));
2539
2540 sampler_reg = src_reg(temp);
2541 } else {
2542 /* Single sampler, or constant array index; the indexing expression
2543 * is just an immediate.
2544 */
2545 sampler_reg = src_reg(sampler);
2546 }
2547
2548 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2549 * emitting anything other than setting up the constant result.
2550 */
2551 if (ir->op == ir_tg4) {
2552 ir_constant *chan = ir->lod_info.component->as_constant();
2553 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2554 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2555 dst_reg result(this, ir->type);
2556 this->result = src_reg(result);
2557 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2558 return;
2559 }
2560 }
2561
2562 /* Should be lowered by do_lower_texture_projection */
2563 assert(!ir->projector);
2564
2565 /* Should be lowered */
2566 assert(!ir->offset || !ir->offset->type->is_array());
2567
2568 /* Generate code to compute all the subexpression trees. This has to be
2569 * done before loading any values into MRFs for the sampler message since
2570 * generating these values may involve SEND messages that need the MRFs.
2571 */
2572 src_reg coordinate;
2573 if (ir->coordinate) {
2574 ir->coordinate->accept(this);
2575 coordinate = this->result;
2576 }
2577
2578 src_reg shadow_comparitor;
2579 if (ir->shadow_comparitor) {
2580 ir->shadow_comparitor->accept(this);
2581 shadow_comparitor = this->result;
2582 }
2583
2584 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2585 src_reg offset_value;
2586 if (has_nonconstant_offset) {
2587 ir->offset->accept(this);
2588 offset_value = src_reg(this->result);
2589 }
2590
2591 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2592 src_reg lod, dPdx, dPdy, sample_index, mcs;
2593 switch (ir->op) {
2594 case ir_tex:
2595 lod = src_reg(0.0f);
2596 lod_type = glsl_type::float_type;
2597 break;
2598 case ir_txf:
2599 case ir_txl:
2600 case ir_txs:
2601 ir->lod_info.lod->accept(this);
2602 lod = this->result;
2603 lod_type = ir->lod_info.lod->type;
2604 break;
2605 case ir_query_levels:
2606 lod = src_reg(0);
2607 lod_type = glsl_type::int_type;
2608 break;
2609 case ir_txf_ms:
2610 ir->lod_info.sample_index->accept(this);
2611 sample_index = this->result;
2612 sample_index_type = ir->lod_info.sample_index->type;
2613
2614 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2615 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2616 else
2617 mcs = src_reg(0u);
2618 break;
2619 case ir_txd:
2620 ir->lod_info.grad.dPdx->accept(this);
2621 dPdx = this->result;
2622
2623 ir->lod_info.grad.dPdy->accept(this);
2624 dPdy = this->result;
2625
2626 lod_type = ir->lod_info.grad.dPdx->type;
2627 break;
2628 case ir_txb:
2629 case ir_lod:
2630 case ir_tg4:
2631 break;
2632 }
2633
2634 enum opcode opcode;
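/* Note that ir_tex maps to TXL: vertex shaders have no implicit
 * derivatives, so an ordinary texture() call samples with the explicit
 * lod of 0.0 loaded above.
 */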
2635 switch (ir->op) {
2636 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2637 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2638 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2639 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2640 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2641 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2642 case ir_tg4: opcode = has_nonconstant_offset
2643 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2644 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2645 case ir_txb:
2646 unreachable("TXB is not valid for vertex shaders.");
2647 case ir_lod:
2648 unreachable("LOD is not valid for vertex shaders.");
2649 default:
2650 unreachable("Unrecognized tex op");
2651 }
2652
2653 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2654 opcode, dst_reg(this, ir->type));
2655
2656 if (ir->offset != NULL && !has_nonconstant_offset) {
2657 inst->offset =
2658 brw_texture_offset(ir->offset->as_constant()->value.i,
2659 ir->offset->type->vector_elements);
2660 }
2661
2662 /* Stuff the channel select bits in the top of the texture offset */
2663 if (ir->op == ir_tg4)
2664 inst->offset |= gather_channel(ir, sampler) << 16;
2665
2666 /* The message header is necessary for:
2667 * - Gen4 (always)
2668 * - Gen9+ for selecting SIMD4x2
2669 * - Texel offsets
2670 * - Gather channel selection
2671 * - Sampler indices too large to fit in a 4-bit value.
2672 */
2673 inst->header_size =
2674 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2675 inst->offset != 0 || ir->op == ir_tg4 ||
2676 is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2677 inst->base_mrf = 2;
2678 inst->mlen = inst->header_size + 1; /* always at least one */
2679 inst->dst.writemask = WRITEMASK_XYZW;
2680 inst->shadow_compare = ir->shadow_comparitor != NULL;
2681
2682 inst->src[1] = sampler_reg;
2683
2684 /* MRF for the first parameter */
2685 int param_base = inst->base_mrf + inst->header_size;
2686
2687 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2688 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2689 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2690 } else {
2691 /* Load the coordinate */
2692 /* FINISHME: gl_clamp_mask and saturate */
2693 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2694 int zero_mask = 0xf & ~coord_mask;
2695
2696 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2697 coordinate));
2698
2699 if (zero_mask != 0) {
2700 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2701 src_reg(0)));
2702 }
2703 /* Load the shadow comparitor */
2704 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2705 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2706 WRITEMASK_X),
2707 shadow_comparitor));
2708 inst->mlen++;
2709 }
2710
2711 /* Load the LOD info */
2712 if (ir->op == ir_tex || ir->op == ir_txl) {
2713 int mrf, writemask;
2714 if (devinfo->gen >= 5) {
2715 mrf = param_base + 1;
2716 if (ir->shadow_comparitor) {
2717 writemask = WRITEMASK_Y;
2718 /* mlen already incremented */
2719 } else {
2720 writemask = WRITEMASK_X;
2721 inst->mlen++;
2722 }
2723 } else /* devinfo->gen == 4 */ {
2724 mrf = param_base;
2725 writemask = WRITEMASK_W;
2726 }
2727 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2728 } else if (ir->op == ir_txf) {
2729 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2730 } else if (ir->op == ir_txf_ms) {
2731 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2732 sample_index));
2733 if (devinfo->gen >= 7) {
2734 /* MCS data is in the first channel of `mcs`, but we need to get it into
2735 * the .y channel of the second vec4 of params, so replicate .x across
2736 * the whole vec4 and then mask off everything except .y
2737 */
2738 mcs.swizzle = BRW_SWIZZLE_XXXX;
2739 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2740 mcs));
2741 }
2742 inst->mlen++;
2743 } else if (ir->op == ir_txd) {
2744 const glsl_type *type = lod_type;
2745
2746 if (devinfo->gen >= 5) {
2747 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2748 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2749 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2750 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2751 inst->mlen++;
2752
2753 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2754 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2755 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2756 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2757 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2758 inst->mlen++;
2759
2760 if (ir->shadow_comparitor) {
2761 emit(MOV(dst_reg(MRF, param_base + 2,
2762 ir->shadow_comparitor->type, WRITEMASK_Z),
2763 shadow_comparitor));
2764 }
2765 }
2766 } else /* devinfo->gen == 4 */ {
2767 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2768 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2769 inst->mlen += 2;
2770 }
2771 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2772 if (ir->shadow_comparitor) {
2773 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2774 shadow_comparitor));
2775 }
2776
2777 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2778 offset_value));
2779 inst->mlen++;
2780 }
2781 }
2782
2783 emit(inst);
2784
2785 /* Fix up the number of layers (the .z component of the txs result) for
2786 * cube map arrays: the hardware returns faces * layers, but the spec
2787 * requires just the layer count.
2787 */
2788 if (ir->op == ir_txs) {
2789 glsl_type const *type = ir->sampler->type;
2790 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2791 type->sampler_array) {
2792 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2793 writemask(inst->dst, WRITEMASK_Z),
2794 src_reg(inst->dst), src_reg(6));
2795 }
2796 }
2797
2798 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2799 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2800 }
2801
2802 swizzle_result(ir, src_reg(inst->dst), sampler);
2803 }
2804
2805 /**
2806 * Apply workarounds for Gen6 gather with UINT/SINT
2807 */
2808 void
2809 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2810 {
2811 if (!wa)
2812 return;
2813
2814 int width = (wa & WA_8BIT) ? 8 : 16;
2815 dst_reg dst_f = dst;
2816 dst_f.type = BRW_REGISTER_TYPE_F;
2817
2818 /* Convert from UNORM to UINT */
2819 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2820 emit(MOV(dst, src_reg(dst_f)));
2821
2822 if (wa & WA_SIGN) {
2823 /* Reinterpret the UINT value as a signed INT value by
2824 * shifting the sign bit into place, then shifting back
2825 * preserving sign.
2826 */
2827 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2828 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2829 }
2830 }
2831
2832 /**
2833 * Set up the gather channel based on the swizzle, for gather4.
2834 */
2835 uint32_t
2836 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2837 {
2838 ir_constant *chan = ir->lod_info.component->as_constant();
2839 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2840 switch (swiz) {
2841 case SWIZZLE_X: return 0;
2842 case SWIZZLE_Y:
2843 /* gather4 sampler is broken for green channel on RG32F --
2844 * we must ask for blue instead.
2845 */
2846 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2847 return 2;
2848 return 1;
2849 case SWIZZLE_Z: return 2;
2850 case SWIZZLE_W: return 3;
2851 default:
2852 unreachable("Not reached"); /* zero, one swizzles handled already */
2853 }
2854 }
2855
2856 void
2857 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2858 {
2859 int s = key->tex.swizzles[sampler];
2860
2861 this->result = src_reg(this, ir->type);
2862 dst_reg swizzled_result(this->result);
2863
2864 if (ir->op == ir_query_levels) {
2865 /* # levels is in .w */
2866 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2867 emit(MOV(swizzled_result, orig_val));
2868 return;
2869 }
2870
2871 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2872 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2873 emit(MOV(swizzled_result, orig_val));
2874 return;
2875 }
2876
2877
2878 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2879 int swizzle[4] = {0};
2880
2881 for (int i = 0; i < 4; i++) {
2882 switch (GET_SWZ(s, i)) {
2883 case SWIZZLE_ZERO:
2884 zero_mask |= (1 << i);
2885 break;
2886 case SWIZZLE_ONE:
2887 one_mask |= (1 << i);
2888 break;
2889 default:
2890 copy_mask |= (1 << i);
2891 swizzle[i] = GET_SWZ(s, i);
2892 break;
2893 }
2894 }
2895
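/* e.g. a texture swizzle of (R, G, ONE, ZERO) copies .xy from the sampled
 * value, then the stores below write 1.0f to .z and 0.0f to .w.
 */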
2896 if (copy_mask) {
2897 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2898 swizzled_result.writemask = copy_mask;
2899 emit(MOV(swizzled_result, orig_val));
2900 }
2901
2902 if (zero_mask) {
2903 swizzled_result.writemask = zero_mask;
2904 emit(MOV(swizzled_result, src_reg(0.0f)));
2905 }
2906
2907 if (one_mask) {
2908 swizzled_result.writemask = one_mask;
2909 emit(MOV(swizzled_result, src_reg(1.0f)));
2910 }
2911 }
2912
2913 void
2914 vec4_visitor::visit(ir_return *)
2915 {
2916 unreachable("not reached");
2917 }
2918
2919 void
2920 vec4_visitor::visit(ir_discard *)
2921 {
2922 unreachable("not reached");
2923 }
2924
2925 void
2926 vec4_visitor::visit(ir_if *ir)
2927 {
2928 /* Don't point the annotation at the if statement, because then it plus
2929 * the then and else blocks get printed.
2930 */
2931 this->base_ir = ir->condition;
2932
2933 if (devinfo->gen == 6) {
2934 emit_if_gen6(ir);
2935 } else {
2936 enum brw_predicate predicate;
2937 emit_bool_to_cond_code(ir->condition, &predicate);
2938 emit(IF(predicate));
2939 }
2940
2941 visit_instructions(&ir->then_instructions);
2942
2943 if (!ir->else_instructions.is_empty()) {
2944 this->base_ir = ir->condition;
2945 emit(BRW_OPCODE_ELSE);
2946
2947 visit_instructions(&ir->else_instructions);
2948 }
2949
2950 this->base_ir = ir->condition;
2951 emit(BRW_OPCODE_ENDIF);
2952 }
2953
2954 void
2955 vec4_visitor::visit(ir_emit_vertex *)
2956 {
2957 unreachable("not reached");
2958 }
2959
2960 void
2961 vec4_visitor::visit(ir_end_primitive *)
2962 {
2963 unreachable("not reached");
2964 }
2965
2966 void
2967 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2968 dst_reg dst, src_reg offset,
2969 src_reg src0, src_reg src1)
2970 {
2971 unsigned mlen = 0;
2972
2973 /* Set the atomic operation offset. */
2974 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2975 mlen++;
2976
2977 /* Set the atomic operation arguments. */
2978 if (src0.file != BAD_FILE) {
2979 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2980 mlen++;
2981 }
2982
2983 if (src1.file != BAD_FILE) {
2984 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2985 mlen++;
2986 }
2987
2988 /* Emit the instruction. Note that this maps to the normal SIMD8
2989 * untyped atomic message on Ivy Bridge, but that's OK because
2990 * unused channels will be masked out.
2991 */
2992 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2993 brw_message_reg(0),
2994 src_reg(surf_index), src_reg(atomic_op));
2995 inst->mlen = mlen;
2996 }
2997
2998 void
2999 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3000 src_reg offset)
3001 {
3002 /* Set the surface read offset. */
3003 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3004
3005 /* Emit the instruction. Note that this maps to the normal SIMD8
3006 * untyped surface read message, but that's OK because unused
3007 * channels will be masked out.
3008 */
3009 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3010 brw_message_reg(0),
3011 src_reg(surf_index), src_reg(1));
3012 inst->mlen = 1;
3013 }
3014
3015 void
3016 vec4_visitor::emit_ndc_computation()
3017 {
3018 /* Get the position */
3019 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3020
3021 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3022 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3023 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3024
3025 current_annotation = "NDC";
3026 dst_reg ndc_w = ndc;
3027 ndc_w.writemask = WRITEMASK_W;
3028 src_reg pos_w = pos;
3029 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3030 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3031
3032 dst_reg ndc_xyz = ndc;
3033 ndc_xyz.writemask = WRITEMASK_XYZ;
3034
3035 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3036 }
3037
3038 void
3039 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3040 {
3041 if (devinfo->gen < 6 &&
3042 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3043 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3044 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3045 dst_reg header1_w = header1;
3046 header1_w.writemask = WRITEMASK_W;
3047
3048 emit(MOV(header1, 0u));
3049
3050 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3051 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3052
3053 current_annotation = "Point size";
3054 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3055 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3056 }
3057
3058 if (key->userclip_active) {
3059 current_annotation = "Clipping flags";
3060 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3061 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3062
3063 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3064 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3065 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3066
3067 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3068 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3069 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3070 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3071 }
3072
3073 /* i965 clipping workaround:
3074 * 1) Test for -ve rhw
3075 * 2) If set,
3076 * set ndc = (0,0,0,0)
3077 * set ucp[6] = 1
3078 *
3079 * Later, clipping will detect ucp[6] and ensure the primitive is
3080 * clipped against all fixed planes.
3081 */
3082 if (devinfo->has_negative_rhw_bug) {
3083 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3084 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3085 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3086 vec4_instruction *inst;
3087 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3088 inst->predicate = BRW_PREDICATE_NORMAL;
3089 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3090 inst->predicate = BRW_PREDICATE_NORMAL;
3091 }
3092
3093 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3094 } else if (devinfo->gen < 6) {
3095 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3096 } else {
3097 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3098 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3099 dst_reg reg_w = reg;
3100 reg_w.writemask = WRITEMASK_W;
3101 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3102 }
3103 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3104 dst_reg reg_y = reg;
3105 reg_y.writemask = WRITEMASK_Y;
3106 reg_y.type = BRW_REGISTER_TYPE_D;
3107 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3108 }
3109 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3110 dst_reg reg_z = reg;
3111 reg_z.writemask = WRITEMASK_Z;
3112 reg_z.type = BRW_REGISTER_TYPE_D;
3113 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3114 }
3115 }
3116 }
3117
3118 void
3119 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3120 {
3121 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3122 *
3123 * "If a linked set of shaders forming the vertex stage contains no
3124 * static write to gl_ClipVertex or gl_ClipDistance, but the
3125 * application has requested clipping against user clip planes through
3126 * the API, then the coordinate written to gl_Position is used for
3127 * comparison against the user clip planes."
3128 *
3129 * This function is only called if the shader didn't write to
3130 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3131 * if the user wrote to it; otherwise we use gl_Position.
3132 */
3133 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3134 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3135 clip_vertex = VARYING_SLOT_POS;
3136 }
3137
3138 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3139 ++i) {
3140 reg.writemask = 1 << i;
3141 emit(DP4(reg,
3142 src_reg(output_reg[clip_vertex]),
3143 src_reg(this->userplane[i + offset])));
3144 }
3145 }
3146
3147 vec4_instruction *
3148 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3149 {
3150 assert (varying < VARYING_SLOT_MAX);
3151 reg.type = output_reg[varying].type;
3152 current_annotation = output_reg_annotation[varying];
3153 /* Copy the register, saturating if necessary */
3154 return emit(MOV(reg, src_reg(output_reg[varying])));
3155 }
3156
3157 void
3158 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3159 {
3160 reg.type = BRW_REGISTER_TYPE_F;
3161
3162 switch (varying) {
3163 case VARYING_SLOT_PSIZ:
3164 {
3165 /* PSIZ is always in slot 0, and is coupled with other flags. */
3166 current_annotation = "indices, point width, clip flags";
3167 emit_psiz_and_flags(reg);
3168 break;
3169 }
3170 case BRW_VARYING_SLOT_NDC:
3171 current_annotation = "NDC";
3172 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3173 break;
3174 case VARYING_SLOT_POS:
3175 current_annotation = "gl_Position";
3176 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3177 break;
3178 case VARYING_SLOT_EDGE:
3179 /* This is present when doing unfilled polygons. We're supposed to copy
3180 * the edge flag from the user-provided vertex array
3181 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3182 * of that attribute (starts as 1.0f). This is then used in clipping to
3183 * determine which edges should be drawn as wireframe.
3184 */
3185 current_annotation = "edge flag";
3186 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3187 glsl_type::float_type, WRITEMASK_XYZW))));
3188 break;
3189 case BRW_VARYING_SLOT_PAD:
3190 /* No need to write to this slot */
3191 break;
3192 case VARYING_SLOT_COL0:
3193 case VARYING_SLOT_COL1:
3194 case VARYING_SLOT_BFC0:
3195 case VARYING_SLOT_BFC1: {
3196 /* These built-in varyings are only supported in compatibility mode,
3197 * and we only support GS in core profile. So, this must be a vertex
3198 * shader.
3199 */
3200 assert(stage == MESA_SHADER_VERTEX);
3201 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3202 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3203 inst->saturate = true;
3204 break;
3205 }
3206
3207 default:
3208 emit_generic_urb_slot(reg, varying);
3209 break;
3210 }
3211 }
3212
3213 static int
3214 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3215 {
3216 if (devinfo->gen >= 6) {
3217 /* URB data written (does not include the message header reg) must
3218 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3219 * section 5.4.3.2.2: URB_INTERLEAVED.
3220 *
3221 * URB entries are allocated on a multiple of 1024 bits, so an
3222 * extra 128 bits written here to make the end align to 256 is
3223 * no problem.
3224 */
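/* mlen counts the header register too, so the data portion (mlen - 1) is
 * even exactly when mlen is odd; e.g. a header plus three data regs
 * (mlen == 4) gets padded to mlen == 5.
 */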
3225 if ((mlen % 2) != 1)
3226 mlen++;
3227 }
3228
3229 return mlen;
3230 }
3231
3232
3233 /**
3234 * Generates the VUE payload plus the necessary URB write instructions to
3235 * output it.
3236 *
3237 * The VUE layout is documented in Volume 2a.
3238 */
3239 void
3240 vec4_visitor::emit_vertex()
3241 {
3242 /* MRF 0 is reserved for the debugger, so start with message header
3243 * in MRF 1.
3244 */
3245 int base_mrf = 1;
3246 int mrf = base_mrf;
3247 /* In the process of generating our URB write message contents, we
3248 * may need to unspill a register or load from an array. Those
3249 * reads would use MRFs 14-15.
3250 */
3251 int max_usable_mrf = 13;
3252
3253 /* The following assertion verifies that max_usable_mrf causes an
3254 * even-numbered amount of URB write data, which will meet gen6's
3255 * requirements for length alignment.
3256 */
3257 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3258
3259 /* First mrf is the g0-based message header containing URB handles and
3260 * such.
3261 */
3262 emit_urb_write_header(mrf++);
3263
3264 if (devinfo->gen < 6) {
3265 emit_ndc_computation();
3266 }
3267
3268 /* Lower legacy ff and ClipVertex clipping to clip distances */
3269 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3270 current_annotation = "user clip distances";
3271
3272 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3273 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3274
3275 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3276 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3277 }
3278
3279 /* We may need to split this up into several URB writes, so do them in a
3280 * loop.
3281 */
3282 int slot = 0;
3283 bool complete = false;
3284 do {
3285 /* URB offset is in URB row increments, and each of our MRFs is half of
3286 * one of those, since we're doing interleaved writes.
3287 */
3288 int offset = slot / 2;
3289
3290 mrf = base_mrf + 1;
3291 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3292 emit_urb_slot(dst_reg(MRF, mrf++),
3293 prog_data->vue_map.slot_to_varying[slot]);
3294
3295 /* If this was max_usable_mrf, we can't fit anything more into this
3296 * URB WRITE.
3297 */
3298 if (mrf > max_usable_mrf) {
3299 slot++;
3300 break;
3301 }
3302 }
3303
3304 complete = slot >= prog_data->vue_map.num_slots;
3305 current_annotation = "URB write";
3306 vec4_instruction *inst = emit_urb_write_opcode(complete);
3307 inst->base_mrf = base_mrf;
3308 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3309 inst->offset += offset;
3310 } while(!complete);
3311 }
3312
3313
3314 src_reg
3315 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3316 src_reg *reladdr, int reg_offset)
3317 {
3318 /* Because we store the values to scratch interleaved like our
3319 * vertex data, we need to scale the vec4 index by 2.
3320 */
3321 int message_header_scale = 2;
3322
3323 /* Pre-gen6, the message header uses byte offsets instead of vec4
3324 * (16-byte) offset units.
3325 */
3326 if (devinfo->gen < 6)
3327 message_header_scale *= 16;
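/* e.g. reg_offset 3 becomes offset 6 in vec4 units on Gen6+, or byte
 * offset 96 (3 * 2 * 16) on older generations.
 */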
3328
3329 if (reladdr) {
3330 src_reg index = src_reg(this, glsl_type::int_type);
3331
3332 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3333 src_reg(reg_offset)));
3334 emit_before(block, inst, MUL(dst_reg(index), index,
3335 src_reg(message_header_scale)));
3336
3337 return index;
3338 } else {
3339 return src_reg(reg_offset * message_header_scale);
3340 }
3341 }
3342
3343 src_reg
3344 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3345 src_reg *reladdr, int reg_offset)
3346 {
3347 if (reladdr) {
3348 src_reg index = src_reg(this, glsl_type::int_type);
3349
3350 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3351 src_reg(reg_offset)));
3352
3353 /* Pre-gen6, the message header uses byte offsets instead of vec4
3354 * (16-byte) offset units.
3355 */
3356 if (devinfo->gen < 6) {
3357 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3358 }
3359
3360 return index;
3361 } else if (devinfo->gen >= 8) {
3362 /* Store the offset in a GRF so we can send-from-GRF. */
3363 src_reg offset = src_reg(this, glsl_type::int_type);
3364 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3365 return offset;
3366 } else {
3367 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3368 return src_reg(reg_offset * message_header_scale);
3369 }
3370 }
3371
3372 /**
3373 * Emits an instruction before @inst to load the value named by @orig_src
3374 * from scratch space at @base_offset to @temp.
3375 *
3376 * @base_offset is measured in 32-byte units (the size of a register).
3377 */
3378 void
3379 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3380 dst_reg temp, src_reg orig_src,
3381 int base_offset)
3382 {
3383 int reg_offset = base_offset + orig_src.reg_offset;
3384 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3385 reg_offset);
3386
3387 emit_before(block, inst, SCRATCH_READ(temp, index));
3388 }
3389
3390 /**
3391 * Emits an instruction after @inst to store the value to be written
3392 * to @orig_dst to scratch space at @base_offset, from @temp.
3393 *
3394 * @base_offset is measured in 32-byte units (the size of a register).
3395 */
3396 void
3397 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3398 int base_offset)
3399 {
3400 int reg_offset = base_offset + inst->dst.reg_offset;
3401 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3402 reg_offset);
3403
3404 /* Create a temporary register to store *inst's result in.
3405 *
3406 * We have to be careful in MOVing from our temporary result register in
3407 * the scratch write. If we swizzle from channels of the temporary that
3408 * weren't initialized, it will confuse live interval analysis, which will
3409 * make spilling fail to make progress.
3410 */
3411 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3412 inst->dst.type),
3413 brw_swizzle_for_mask(inst->dst.writemask));
3414 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3415 inst->dst.writemask));
3416 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3417 write->predicate = inst->predicate;
3418 write->ir = inst->ir;
3419 write->annotation = inst->annotation;
3420 inst->insert_after(block, write);
3421
3422 inst->dst.file = temp.file;
3423 inst->dst.reg = temp.reg;
3424 inst->dst.reg_offset = temp.reg_offset;
3425 inst->dst.reladdr = NULL;
3426 }
3427
3428 /**
3429 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3430 * adds the scratch read(s) before \p inst. The function also checks for
3431 * recursive reladdr scratch accesses, issuing the corresponding scratch
3432 * loads and rewriting reladdr references accordingly.
3433 *
3434 * \return \p src if it did not require a scratch load, otherwise, the
3435 * register holding the result of the scratch load that the caller should
3436 * use to rewrite src.
3437 */
3438 src_reg
3439 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3440 vec4_instruction *inst, src_reg src)
3441 {
3442 /* Resolve recursive reladdr scratch access by calling ourselves
3443 * with src.reladdr
3444 */
3445 if (src.reladdr)
3446 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3447 *src.reladdr);
3448
3449 /* Now handle scratch access on src */
3450 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3451 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3452 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3453 src.reg = temp.reg;
3454 src.reg_offset = temp.reg_offset;
3455 src.reladdr = NULL;
3456 }
3457
3458 return src;
3459 }
3460
3461 /**
3462 * We can't generally support array access in GRF space, because a
3463 * single instruction's destination can only span 2 contiguous
3464 * registers. So, we send all GRF arrays that get variable index
3465 * access to scratch space.
3466 */
3467 void
3468 vec4_visitor::move_grf_array_access_to_scratch()
3469 {
3470 int scratch_loc[this->alloc.count];
3471 memset(scratch_loc, -1, sizeof(scratch_loc));
3472
3473 /* First, calculate the set of virtual GRFs that need to be punted
3474 * to scratch due to having any array access on them, and where in
3475 * scratch.
3476 */
3477 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3478 if (inst->dst.file == GRF && inst->dst.reladdr) {
3479 if (scratch_loc[inst->dst.reg] == -1) {
3480 scratch_loc[inst->dst.reg] = c->last_scratch;
3481 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3482 }
3483
3484 for (src_reg *iter = inst->dst.reladdr;
3485 iter->reladdr;
3486 iter = iter->reladdr) {
3487 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3488 scratch_loc[iter->reg] = c->last_scratch;
3489 c->last_scratch += this->alloc.sizes[iter->reg];
3490 }
3491 }
3492 }
3493
3494 for (int i = 0 ; i < 3; i++) {
3495 for (src_reg *iter = &inst->src[i];
3496 iter->reladdr;
3497 iter = iter->reladdr) {
3498 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3499 scratch_loc[iter->reg] = c->last_scratch;
3500 c->last_scratch += this->alloc.sizes[iter->reg];
3501 }
3502 }
3503 }
3504 }
3505
3506 /* Now, for anything that will be accessed through scratch, rewrite
3507 * it to load/store. Note that this is a _safe list walk, because
3508 * we may generate a new scratch_write instruction after the one
3509 * we're processing.
3510 */
3511 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3512 /* Set up the annotation tracking for new generated instructions. */
3513 base_ir = inst->ir;
3514 current_annotation = inst->annotation;
3515
3516 /* First handle scratch access on the dst. Notice we have to handle
3517 * the case where the dst's reladdr also points to scratch space.
3518 */
3519 if (inst->dst.reladdr)
3520 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3521 *inst->dst.reladdr);
3522
3523 /* Now that we have handled any (possibly recursive) reladdr scratch
3524 * accesses for dst we can safely do the scratch write for dst itself
3525 */
3526 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3527 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3528
3529 /* Now handle scratch access on any src. In this case, since inst->src[i]
3530 * already is a src_reg, we can just call emit_resolve_reladdr with
3531 * inst->src[i] and it will take care of handling scratch loads for
3532 * both src and src.reladdr (recursively).
3533 */
3534 for (int i = 0 ; i < 3; i++) {
3535 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3536 inst->src[i]);
3537 }
3538 }
3539 }
3540
3541 /**
3542 * Emits an instruction before @inst to load the value named by @orig_src
3543 * from the pull constant buffer (surface) at @base_offset to @temp.
3544 */
3545 void
3546 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3547 dst_reg temp, src_reg orig_src,
3548 int base_offset)
3549 {
3550 int reg_offset = base_offset + orig_src.reg_offset;
3551 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3552 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3553 reg_offset);
3554
3555 emit_pull_constant_load_reg(temp,
3556 index,
3557 offset,
3558 block, inst);
3559 }
3560
3561 /**
3562 * Implements array access of uniforms by inserting a
3563 * PULL_CONSTANT_LOAD instruction.
3564 *
3565 * Unlike temporary GRF array access (where we don't support it due to
3566 * the difficulty of doing relative addressing on instruction
3567 * destinations), we could potentially do array access of uniforms
3568 * that were loaded in GRF space as push constants. In real-world
3569 * usage we've seen, though, the arrays being used are always larger
3570 * than we could load as push constants, so just always move all
3571 * uniform array access out to a pull constant buffer.
3572 */
3573 void
3574 vec4_visitor::move_uniform_array_access_to_pull_constants()
3575 {
3576 int pull_constant_loc[this->uniforms];
3577 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3578 bool nested_reladdr;
3579
3580 /* Walk through and find array access of uniforms. Put a copy of that
3581 * uniform in the pull constant buffer.
3582 *
3583 * Note that we don't move constant-indexed accesses to arrays. No
3584 * testing has been done of the performance impact of this choice.
3585 */
3586 do {
3587 nested_reladdr = false;
3588
3589 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3590 for (int i = 0 ; i < 3; i++) {
3591 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3592 continue;
3593
3594 int uniform = inst->src[i].reg;
3595
3596 if (inst->src[i].reladdr->reladdr)
3597 nested_reladdr = true; /* will need another pass */
3598
3599 /* If this array isn't already present in the pull constant buffer,
3600 * add it.
3601 */
3602 if (pull_constant_loc[uniform] == -1) {
3603 const gl_constant_value **values =
3604 &stage_prog_data->param[uniform * 4];
3605
3606 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3607
3608 assert(uniform < uniform_array_size);
3609 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3610 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3611 = values[j];
3612 }
3613 }
3614
3615 /* Set up the annotation tracking for new generated instructions. */
3616 base_ir = inst->ir;
3617 current_annotation = inst->annotation;
3618
3619 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3620
3621 emit_pull_constant_load(block, inst, temp, inst->src[i],
3622 pull_constant_loc[uniform]);
3623
3624 inst->src[i].file = temp.file;
3625 inst->src[i].reg = temp.reg;
3626 inst->src[i].reg_offset = temp.reg_offset;
3627 inst->src[i].reladdr = NULL;
3628 }
3629 }
3630 } while (nested_reladdr);
3631
3632 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3633 * no need to track them as larger-than-vec4 objects. This will be
3634 * relied on in cutting out unused uniform vectors from push
3635 * constants.
3636 */
3637 split_uniform_registers();
3638 }
3639
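/**
 * Lower a negate modifier on an unsigned (UD) source by materializing the
 * negated value with an explicit MOV, so the consuming instruction sees a
 * plain register without the modifier.
 */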
3640 void
3641 vec4_visitor::resolve_ud_negate(src_reg *reg)
3642 {
3643 if (reg->type != BRW_REGISTER_TYPE_UD ||
3644 !reg->negate)
3645 return;
3646
3647 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3648 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3649 *reg = temp;
3650 }
3651
3652 /**
3653 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3654 *
3655 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3656 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3657 */
3658 void
3659 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3660 {
3661 assert(devinfo->gen <= 5);
3662
3663 if (!rvalue->type->is_boolean())
3664 return;
3665
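   /* Keep only the defined LSB, then negate it: 1 becomes -1 (all bits
    * set), giving the canonical 0/~0 boolean.
    */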
3666 src_reg and_result = src_reg(this, rvalue->type);
3667 src_reg neg_result = src_reg(this, rvalue->type);
3668 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3669 emit(MOV(dst_reg(neg_result), negate(and_result)));
3670 *reg = neg_result;
3671 }
3672
3673 vec4_visitor::vec4_visitor(struct brw_context *brw,
3674 struct brw_vec4_compile *c,
3675 struct gl_program *prog,
3676 const struct brw_vue_prog_key *key,
3677 struct brw_vue_prog_data *prog_data,
3678 struct gl_shader_program *shader_prog,
3679 gl_shader_stage stage,
3680 void *mem_ctx,
3681 bool no_spills,
3682 shader_time_shader_type st_base,
3683 shader_time_shader_type st_written,
3684 shader_time_shader_type st_reset)
3685 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3686 c(c),
3687 key(key),
3688 prog_data(prog_data),
3689 sanity_param_count(0),
3690 fail_msg(NULL),
3691 first_non_payload_grf(0),
3692 need_all_constants_in_pull_buffer(false),
3693 no_spills(no_spills),
3694 st_base(st_base),
3695 st_written(st_written),
3696 st_reset(st_reset)
3697 {
3698 this->mem_ctx = mem_ctx;
3699 this->failed = false;
3700
3701 this->base_ir = NULL;
3702 this->current_annotation = NULL;
3703 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3704
3705 this->variable_ht = hash_table_ctor(0,
3706 hash_table_pointer_hash,
3707 hash_table_pointer_compare);
3708
3709 this->virtual_grf_start = NULL;
3710 this->virtual_grf_end = NULL;
3711 this->live_intervals = NULL;
3712
3713 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3714
3715 this->uniforms = 0;
3716
3717 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3718 * at least one. See setup_uniforms() in brw_vec4.cpp.
3719 */
3720 this->uniform_array_size = 1;
3721 if (prog_data) {
3722 this->uniform_array_size =
3723 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3724 }
3725
3726 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3727 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3728 }
3729
3730 vec4_visitor::~vec4_visitor()
3731 {
3732 hash_table_dtor(this->variable_ht);
3733 }
3734
3735
3736 void
3737 vec4_visitor::fail(const char *format, ...)
3738 {
3739 va_list va;
3740 char *msg;
3741
3742 if (failed)
3743 return;
3744
3745 failed = true;
3746
3747 va_start(va, format);
3748 msg = ralloc_vasprintf(mem_ctx, format, va);
3749 va_end(va);
3750 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3751
3752 this->fail_msg = msg;
3753
3754 if (debug_enabled) {
3755 fprintf(stderr, "%s", msg);
3756 }
3757 }
3758
3759 } /* namespace brw */