1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
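/* Factory-method generators: each macro below stamps out a
 * vec4_visitor::<OP>() helper that just allocates a vec4_instruction with
 * the matching BRW_OPCODE_* (ALU2_ACC additionally marks it as writing the
 * accumulator).  The instruction is not emitted; callers hand it to emit()
 * and may tweak its flags first.
 */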
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
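/* Emit a dot product of 2, 3 or 4 components by selecting DP2/DP3/DP4. */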
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
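/* 0x00, 0x60, 0x70 and 0x78 are the 8-bit vector-float (VF) encodings of
 * 0.0, 8.0, 16.0 and 24.0; the type-converting MOV below turns them into
 * the integer shift counts <0, 8, 16, 24>.
 */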
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
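/* packUnorm4x8(): clamp each component to [0, 1], scale by 255, round to
 * the nearest even integer, and pack the low byte of each channel into a
 * single UD result.
 */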
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
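/* Size of a value of the given GLSL type, measured in vec4 slots.  Scalars
 * and vectors each take a full slot, matrices take one slot per column,
 * arrays and structs are the sum of their members, and opaque types
 * (samplers, atomic counters) take none.
 */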
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 case GLSL_TYPE_FUNCTION:
619 unreachable("not reached");
620 }
621
622 return 0;
623 }
624
625 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
626 {
627 init();
628
629 this->file = GRF;
630 this->reg = v->alloc.allocate(type_size(type));
631
632 if (type->is_array() || type->is_record()) {
633 this->swizzle = BRW_SWIZZLE_NOOP;
634 } else {
635 this->swizzle = brw_swizzle_for_size(type->vector_elements);
636 }
637
638 this->type = brw_type_for_base_type(type);
639 }
640
641 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
642 {
643 assert(size > 0);
644
645 init();
646
647 this->file = GRF;
648 this->reg = v->alloc.allocate(type_size(type) * size);
649
650 this->swizzle = BRW_SWIZZLE_NOOP;
651
652 this->type = brw_type_for_base_type(type);
653 }
654
655 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
656 {
657 init();
658
659 this->file = GRF;
660 this->reg = v->alloc.allocate(type_size(type));
661
662 if (type->is_array() || type->is_record()) {
663 this->writemask = WRITEMASK_XYZW;
664 } else {
665 this->writemask = (1 << type->vector_elements) - 1;
666 }
667
668 this->type = brw_type_for_base_type(type);
669 }
670
671 /* Our support for uniforms is piggy-backed on the struct
672 * gl_fragment_program, because that's where the values actually
673 * get stored, rather than in some global gl_shader_program uniform
674 * store.
675 */
676 void
677 vec4_visitor::setup_uniform_values(ir_variable *ir)
678 {
679 int namelen = strlen(ir->name);
680
681 /* The data for our (non-builtin) uniforms is stored in a series of
682 * gl_uniform_driver_storage structs for each subcomponent that
683 * glGetUniformLocation() could name. We know it's been set up in the same
684 * order we'd walk the type, so walk the list of storage and find anything
685 * with our name, or the prefix of a component that starts with our name.
686 */
687 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
688 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
689
690 if (storage->builtin)
691 continue;
692
693 if (strncmp(ir->name, storage->name, namelen) != 0 ||
694 (storage->name[namelen] != 0 &&
695 storage->name[namelen] != '.' &&
696 storage->name[namelen] != '[')) {
697 continue;
698 }
699
700 gl_constant_value *components = storage->storage;
701 unsigned vector_count = (MAX2(storage->array_elements, 1) *
702 storage->type->matrix_columns);
703
704 for (unsigned s = 0; s < vector_count; s++) {
705 assert(uniforms < uniform_array_size);
706 uniform_vector_size[uniforms] = storage->type->vector_elements;
707
708 int i;
709 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
710 stage_prog_data->param[uniforms * 4 + i] = components;
711 components++;
712 }
713 for (; i < 4; i++) {
714 static gl_constant_value zero = { 0.0 };
715 stage_prog_data->param[uniforms * 4 + i] = &zero;
716 }
717
718 uniforms++;
719 }
720 }
721 }
722
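/* Set up one vec4 uniform per user clip plane, pointing the param entries
 * at the clip plane coefficients stored in the GL context.
 */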
723 void
724 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
725 {
726 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
727 assert(this->uniforms < uniform_array_size);
728 this->uniform_vector_size[this->uniforms] = 4;
729 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
730 this->userplane[i].type = BRW_REGISTER_TYPE_F;
731 for (int j = 0; j < 4; ++j) {
732 stage_prog_data->param[this->uniforms * 4 + j] =
733 (gl_constant_value *) &clip_planes[i][j];
734 }
735 ++this->uniforms;
736 }
737 }
738
739 /* Our support for builtin uniforms is even scarier than non-builtin.
740 * It sits on top of the PROG_STATE_VAR parameters that are
741 * automatically updated from GL context state.
742 */
743 void
744 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
745 {
746 const ir_state_slot *const slots = ir->get_state_slots();
747 assert(slots != NULL);
748
749 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
750 /* This state reference has already been setup by ir_to_mesa,
751 * but we'll get the same index back here. We can reference
752 * ParameterValues directly, since unlike brw_fs.cpp, we never
753 * add new state references during compile.
754 */
755 int index = _mesa_add_state_reference(this->prog->Parameters,
756 (gl_state_index *)slots[i].tokens);
757 gl_constant_value *values =
758 &this->prog->Parameters->ParameterValues[index][0];
759
760 assert(this->uniforms < uniform_array_size);
761
762 for (unsigned j = 0; j < 4; j++)
763 stage_prog_data->param[this->uniforms * 4 + j] =
764 &values[GET_SWZ(slots[i].swizzle, j)];
765
766 this->uniform_vector_size[this->uniforms] =
767 (ir->type->is_scalar() || ir->type->is_vector() ||
768 ir->type->is_matrix() ? ir->type->vector_elements : 4);
769
770 this->uniforms++;
771 }
772 }
773
774 dst_reg *
775 vec4_visitor::variable_storage(ir_variable *var)
776 {
777 return (dst_reg *)hash_table_find(this->variable_ht, var);
778 }
779
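/* Evaluate a boolean rvalue and load its result into the flag register,
 * returning in *predicate the predicate (NORMAL, or an ALIGN16 ANY4H/ALL4H
 * variant) that a following predicated instruction should use.
 */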
780 void
781 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
782 enum brw_predicate *predicate)
783 {
784 ir_expression *expr = ir->as_expression();
785
786 *predicate = BRW_PREDICATE_NORMAL;
787
788 if (expr && expr->operation != ir_binop_ubo_load) {
789 src_reg op[3];
790 vec4_instruction *inst;
791
792 assert(expr->get_num_operands() <= 3);
793 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
794 expr->operands[i]->accept(this);
795 op[i] = this->result;
796
797 resolve_ud_negate(&op[i]);
798 }
799
800 switch (expr->operation) {
801 case ir_unop_logic_not:
802 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
803 inst->conditional_mod = BRW_CONDITIONAL_Z;
804 break;
805
806 case ir_binop_logic_xor:
807 if (devinfo->gen <= 5) {
808 src_reg temp = src_reg(this, ir->type);
809 emit(XOR(dst_reg(temp), op[0], op[1]));
810 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
811 } else {
812 inst = emit(XOR(dst_null_d(), op[0], op[1]));
813 }
814 inst->conditional_mod = BRW_CONDITIONAL_NZ;
815 break;
816
817 case ir_binop_logic_or:
818 if (devinfo->gen <= 5) {
819 src_reg temp = src_reg(this, ir->type);
820 emit(OR(dst_reg(temp), op[0], op[1]));
821 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
822 } else {
823 inst = emit(OR(dst_null_d(), op[0], op[1]));
824 }
825 inst->conditional_mod = BRW_CONDITIONAL_NZ;
826 break;
827
828 case ir_binop_logic_and:
829 if (devinfo->gen <= 5) {
830 src_reg temp = src_reg(this, ir->type);
831 emit(AND(dst_reg(temp), op[0], op[1]));
832 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
833 } else {
834 inst = emit(AND(dst_null_d(), op[0], op[1]));
835 }
836 inst->conditional_mod = BRW_CONDITIONAL_NZ;
837 break;
838
839 case ir_unop_f2b:
840 if (devinfo->gen >= 6) {
841 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
842 } else {
843 inst = emit(MOV(dst_null_f(), op[0]));
844 inst->conditional_mod = BRW_CONDITIONAL_NZ;
845 }
846 break;
847
848 case ir_unop_i2b:
849 if (devinfo->gen >= 6) {
850 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
851 } else {
852 inst = emit(MOV(dst_null_d(), op[0]));
853 inst->conditional_mod = BRW_CONDITIONAL_NZ;
854 }
855 break;
856
857 case ir_binop_all_equal:
858 if (devinfo->gen <= 5) {
859 resolve_bool_comparison(expr->operands[0], &op[0]);
860 resolve_bool_comparison(expr->operands[1], &op[1]);
861 }
862 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
863 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
864 break;
865
866 case ir_binop_any_nequal:
867 if (devinfo->gen <= 5) {
868 resolve_bool_comparison(expr->operands[0], &op[0]);
869 resolve_bool_comparison(expr->operands[1], &op[1]);
870 }
871 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
872 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
873 break;
874
875 case ir_unop_any:
876 if (devinfo->gen <= 5) {
877 resolve_bool_comparison(expr->operands[0], &op[0]);
878 }
879 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
880 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
881 break;
882
883 case ir_binop_greater:
884 case ir_binop_gequal:
885 case ir_binop_less:
886 case ir_binop_lequal:
887 case ir_binop_equal:
888 case ir_binop_nequal:
889 if (devinfo->gen <= 5) {
890 resolve_bool_comparison(expr->operands[0], &op[0]);
891 resolve_bool_comparison(expr->operands[1], &op[1]);
892 }
893 emit(CMP(dst_null_d(), op[0], op[1],
894 brw_conditional_for_comparison(expr->operation)));
895 break;
896
897 case ir_triop_csel: {
898 /* Expand the boolean condition into the flag register. */
899 inst = emit(MOV(dst_null_d(), op[0]));
900 inst->conditional_mod = BRW_CONDITIONAL_NZ;
901
902 /* Select which boolean to return. */
903 dst_reg temp(this, expr->operands[1]->type);
904 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
905 inst->predicate = BRW_PREDICATE_NORMAL;
906
907 /* Expand the result to a condition code. */
908 inst = emit(MOV(dst_null_d(), src_reg(temp)));
909 inst->conditional_mod = BRW_CONDITIONAL_NZ;
910 break;
911 }
912
913 default:
914 unreachable("not reached");
915 }
916 return;
917 }
918
919 ir->accept(this);
920
921 resolve_ud_negate(&this->result);
922
923 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
924 inst->conditional_mod = BRW_CONDITIONAL_NZ;
925 }
926
927 /**
928 * Emit a gen6 IF statement with the comparison folded into the IF
929 * instruction.
930 */
931 void
932 vec4_visitor::emit_if_gen6(ir_if *ir)
933 {
934 ir_expression *expr = ir->condition->as_expression();
935
936 if (expr && expr->operation != ir_binop_ubo_load) {
937 src_reg op[3];
938 dst_reg temp;
939
940 assert(expr->get_num_operands() <= 3);
941 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
942 expr->operands[i]->accept(this);
943 op[i] = this->result;
944 }
945
946 switch (expr->operation) {
947 case ir_unop_logic_not:
948 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
949 return;
950
951 case ir_binop_logic_xor:
952 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
953 return;
954
955 case ir_binop_logic_or:
956 temp = dst_reg(this, glsl_type::bool_type);
957 emit(OR(temp, op[0], op[1]));
958 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
959 return;
960
961 case ir_binop_logic_and:
962 temp = dst_reg(this, glsl_type::bool_type);
963 emit(AND(temp, op[0], op[1]));
964 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
965 return;
966
967 case ir_unop_f2b:
968 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
969 return;
970
971 case ir_unop_i2b:
972 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
973 return;
974
975 case ir_binop_greater:
976 case ir_binop_gequal:
977 case ir_binop_less:
978 case ir_binop_lequal:
979 case ir_binop_equal:
980 case ir_binop_nequal:
981 emit(IF(op[0], op[1],
982 brw_conditional_for_comparison(expr->operation)));
983 return;
984
985 case ir_binop_all_equal:
986 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
987 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
988 return;
989
990 case ir_binop_any_nequal:
991 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
992 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
993 return;
994
995 case ir_unop_any:
996 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
997 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
998 return;
999
1000 case ir_triop_csel: {
1001 /* Expand the boolean condition into the flag register. */
1002 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1003 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1004
1005 /* Select which boolean to return. */
1006 dst_reg temp(this, expr->operands[1]->type);
1007 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1008 inst->predicate = BRW_PREDICATE_NORMAL;
1009
1010 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1011 return;
1012 }
1013
1014 default:
1015 unreachable("not reached");
1016 }
1017 return;
1018 }
1019
1020 ir->condition->accept(this);
1021
1022 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1023 }
1024
1025 void
1026 vec4_visitor::visit(ir_variable *ir)
1027 {
1028 dst_reg *reg = NULL;
1029
1030 if (variable_storage(ir))
1031 return;
1032
1033 switch (ir->data.mode) {
1034 case ir_var_shader_in:
1035 assert(ir->data.location != -1);
1036 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1037 break;
1038
1039 case ir_var_shader_out:
1040 assert(ir->data.location != -1);
1041 reg = new(mem_ctx) dst_reg(this, ir->type);
1042
1043 for (int i = 0; i < type_size(ir->type); i++) {
1044 output_reg[ir->data.location + i] = *reg;
1045 output_reg[ir->data.location + i].reg_offset = i;
1046 output_reg[ir->data.location + i].type =
1047 brw_type_for_base_type(ir->type->get_scalar_type());
1048 output_reg_annotation[ir->data.location + i] = ir->name;
1049 }
1050 break;
1051
1052 case ir_var_auto:
1053 case ir_var_temporary:
1054 reg = new(mem_ctx) dst_reg(this, ir->type);
1055 break;
1056
1057 case ir_var_uniform:
1058 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1059
1060 /* Thanks to the lower_ubo_reference pass, we will see only
1061 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1062 * variables, so no need for them to be in variable_ht.
1063 *
1064 * Some uniforms, such as samplers and atomic counters, have no actual
1065 * storage, so we should ignore them.
1066 */
1067 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1068 return;
1069
1070 /* Track how big the whole uniform variable is, in case we need to put a
1071 * copy of its data into pull constants for array access.
1072 */
1073 assert(this->uniforms < uniform_array_size);
1074 this->uniform_size[this->uniforms] = type_size(ir->type);
1075
1076 if (!strncmp(ir->name, "gl_", 3)) {
1077 setup_builtin_uniform_values(ir);
1078 } else {
1079 setup_uniform_values(ir);
1080 }
1081 break;
1082
1083 case ir_var_system_value:
1084 reg = make_reg_for_system_value(ir);
1085 break;
1086
1087 default:
1088 unreachable("not reached");
1089 }
1090
1091 reg->type = brw_type_for_base_type(ir->type);
1092 hash_table_insert(this->variable_ht, reg, ir);
1093 }
1094
1095 void
1096 vec4_visitor::visit(ir_loop *ir)
1097 {
1098 /* We don't want debugging output to print the whole body of the
1099 * loop as the annotation.
1100 */
1101 this->base_ir = NULL;
1102
1103 emit(BRW_OPCODE_DO);
1104
1105 visit_instructions(&ir->body_instructions);
1106
1107 emit(BRW_OPCODE_WHILE);
1108 }
1109
1110 void
1111 vec4_visitor::visit(ir_loop_jump *ir)
1112 {
1113 switch (ir->mode) {
1114 case ir_loop_jump::jump_break:
1115 emit(BRW_OPCODE_BREAK);
1116 break;
1117 case ir_loop_jump::jump_continue:
1118 emit(BRW_OPCODE_CONTINUE);
1119 break;
1120 }
1121 }
1122
1123
1124 void
1125 vec4_visitor::visit(ir_function_signature *)
1126 {
1127 unreachable("not reached");
1128 }
1129
1130 void
1131 vec4_visitor::visit(ir_function *ir)
1132 {
1133 /* Ignore function bodies other than main() -- we shouldn't see calls to
1134 * them since they should all be inlined.
1135 */
1136 if (strcmp(ir->name, "main") == 0) {
1137 const ir_function_signature *sig;
1138 exec_list empty;
1139
1140 sig = ir->matching_signature(NULL, &empty, false);
1141
1142 assert(sig);
1143
1144 visit_instructions(&sig->body);
1145 }
1146 }
1147
1148 bool
1149 vec4_visitor::try_emit_mad(ir_expression *ir)
1150 {
1151 /* 3-src instructions were introduced in gen6. */
1152 if (devinfo->gen < 6)
1153 return false;
1154
1155 /* MAD can only handle floating-point data. */
1156 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1157 return false;
1158
1159 ir_rvalue *nonmul;
1160 ir_expression *mul;
1161 bool mul_negate, mul_abs;
1162
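/* Look for a multiply (possibly wrapped in a negate or abs) in either
 * operand of the add; the other operand becomes the MAD's addend.
 */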
1163 for (int i = 0; i < 2; i++) {
1164 mul_negate = false;
1165 mul_abs = false;
1166
1167 mul = ir->operands[i]->as_expression();
1168 nonmul = ir->operands[1 - i];
1169
1170 if (mul && mul->operation == ir_unop_abs) {
1171 mul = mul->operands[0]->as_expression();
1172 mul_abs = true;
1173 } else if (mul && mul->operation == ir_unop_neg) {
1174 mul = mul->operands[0]->as_expression();
1175 mul_negate = true;
1176 }
1177
1178 if (mul && mul->operation == ir_binop_mul)
1179 break;
1180 }
1181
1182 if (!mul || mul->operation != ir_binop_mul)
1183 return false;
1184
1185 nonmul->accept(this);
1186 src_reg src0 = fix_3src_operand(this->result);
1187
1188 mul->operands[0]->accept(this);
1189 src_reg src1 = fix_3src_operand(this->result);
1190 src1.negate ^= mul_negate;
1191 src1.abs = mul_abs;
1192 if (mul_abs)
1193 src1.negate = false;
1194
1195 mul->operands[1]->accept(this);
1196 src_reg src2 = fix_3src_operand(this->result);
1197 src2.abs = mul_abs;
1198 if (mul_abs)
1199 src2.negate = false;
1200
1201 this->result = src_reg(this, ir->type);
1202 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1203
1204 return true;
1205 }
1206
1207 bool
1208 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1209 {
1210 /* This optimization relies on CMP setting the destination to 0 when
1211 * false. Early hardware only sets the least significant bit, and
1212 * leaves the other bits undefined. So we can't use it.
1213 */
1214 if (devinfo->gen < 6)
1215 return false;
1216
1217 ir_expression *const cmp = ir->operands[0]->as_expression();
1218
1219 if (cmp == NULL)
1220 return false;
1221
1222 switch (cmp->operation) {
1223 case ir_binop_less:
1224 case ir_binop_greater:
1225 case ir_binop_lequal:
1226 case ir_binop_gequal:
1227 case ir_binop_equal:
1228 case ir_binop_nequal:
1229 break;
1230
1231 default:
1232 return false;
1233 }
1234
1235 cmp->operands[0]->accept(this);
1236 const src_reg cmp_src0 = this->result;
1237
1238 cmp->operands[1]->accept(this);
1239 const src_reg cmp_src1 = this->result;
1240
1241 this->result = src_reg(this, ir->type);
1242
1243 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1244 brw_conditional_for_comparison(cmp->operation)));
1245
1246 /* If the comparison is false, this->result will just happen to be zero.
1247 */
1248 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1249 this->result, src_reg(1.0f));
1250 inst->predicate = BRW_PREDICATE_NORMAL;
1251 inst->predicate_inverse = true;
1252
1253 return true;
1254 }
1255
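/* Emit a MIN or MAX: BRW_CONDITIONAL_L selects the minimum and
 * BRW_CONDITIONAL_GE the maximum.  On gen6+ a single SEL with a conditional
 * modifier suffices; earlier parts use an explicit CMP followed by a
 * predicated SEL.
 */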
1256 void
1257 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1258 src_reg src0, src_reg src1)
1259 {
1260 vec4_instruction *inst;
1261
1262 if (devinfo->gen >= 6) {
1263 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1264 inst->conditional_mod = conditionalmod;
1265 } else {
1266 emit(CMP(dst, src0, src1, conditionalmod));
1267
1268 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1269 inst->predicate = BRW_PREDICATE_NORMAL;
1270 }
1271 }
1272
1273 void
1274 vec4_visitor::emit_lrp(const dst_reg &dst,
1275 const src_reg &x, const src_reg &y, const src_reg &a)
1276 {
1277 if (devinfo->gen >= 6) {
1278 /* Note that the instruction's argument order is reversed from GLSL
1279 * and the IR.
1280 */
1281 emit(LRP(dst,
1282 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1283 } else {
1284 /* Earlier generations don't support three source operations, so we
1285 * need to emit x*(1-a) + y*a.
1286 */
1287 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1288 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1289 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1290 y_times_a.writemask = dst.writemask;
1291 one_minus_a.writemask = dst.writemask;
1292 x_times_one_minus_a.writemask = dst.writemask;
1293
1294 emit(MUL(y_times_a, y, a));
1295 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1296 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1297 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1298 }
1299 }
1300
1301 /**
1302 * Emits the instructions needed to perform a pull constant load. before_block
1303 * and before_inst can be NULL, in which case the instructions will be
1304 * appended to the end of the instruction list.
1305 */
1306 void
1307 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1308 src_reg surf_index,
1309 src_reg offset_reg,
1310 bblock_t *before_block,
1311 vec4_instruction *before_inst)
1312 {
1313 assert((before_inst == NULL && before_block == NULL) ||
1314 (before_inst && before_block));
1315
1316 vec4_instruction *pull;
1317
1318 if (devinfo->gen >= 9) {
1319 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1320 src_reg header(this, glsl_type::uvec4_type, 2);
1321
1322 pull = new(mem_ctx)
1323 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1324 dst_reg(header));
1325
1326 if (before_inst)
1327 emit_before(before_block, before_inst, pull);
1328 else
1329 emit(pull);
1330
1331 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1332 offset_reg.type);
1333 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1334
1335 if (before_inst)
1336 emit_before(before_block, before_inst, pull);
1337 else
1338 emit(pull);
1339
1340 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1341 dst,
1342 surf_index,
1343 header);
1344 pull->mlen = 2;
1345 pull->header_size = 1;
1346 } else if (devinfo->gen >= 7) {
1347 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1348
1349 grf_offset.type = offset_reg.type;
1350
1351 pull = MOV(grf_offset, offset_reg);
1352
1353 if (before_inst)
1354 emit_before(before_block, before_inst, pull);
1355 else
1356 emit(pull);
1357
1358 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1359 dst,
1360 surf_index,
1361 src_reg(grf_offset));
1362 pull->mlen = 1;
1363 } else {
1364 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1365 dst,
1366 surf_index,
1367 offset_reg);
1368 pull->base_mrf = 14;
1369 pull->mlen = 1;
1370 }
1371
1372 if (before_inst)
1373 emit_before(before_block, before_inst, pull);
1374 else
1375 emit(pull);
1376 }
1377
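/* Copy the value of an arbitrary live channel of src into every channel of
 * dst (FIND_LIVE_CHANNEL + BROADCAST), e.g. to turn the per-channel UBO
 * block index computed below into a single surface index.
 */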
1378 void
1379 vec4_visitor::emit_uniformize(const dst_reg &dst, const src_reg &src)
1380 {
1381 const src_reg chan_index(this, glsl_type::uint_type);
1382
1383 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1384 ->force_writemask_all = true;
1385 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1386 ->force_writemask_all = true;
1387 }
1388
1389 void
1390 vec4_visitor::visit(ir_expression *ir)
1391 {
1392 unsigned int operand;
1393 src_reg op[ARRAY_SIZE(ir->operands)];
1394 vec4_instruction *inst;
1395
1396 if (ir->operation == ir_binop_add) {
1397 if (try_emit_mad(ir))
1398 return;
1399 }
1400
1401 if (ir->operation == ir_unop_b2f) {
1402 if (try_emit_b2f_of_compare(ir))
1403 return;
1404 }
1405
1406 /* Storage for our result. Ideally for an assignment we'd be using
1407 * the actual storage for the result here, instead.
1408 */
1409 dst_reg result_dst(this, ir->type);
1410 src_reg result_src(result_dst);
1411
1412 if (ir->operation == ir_triop_csel) {
1413 ir->operands[1]->accept(this);
1414 op[1] = this->result;
1415 ir->operands[2]->accept(this);
1416 op[2] = this->result;
1417
1418 enum brw_predicate predicate;
1419 emit_bool_to_cond_code(ir->operands[0], &predicate);
1420 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1421 inst->predicate = predicate;
1422 this->result = result_src;
1423 return;
1424 }
1425
1426 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1427 this->result.file = BAD_FILE;
1428 ir->operands[operand]->accept(this);
1429 if (this->result.file == BAD_FILE) {
1430 fprintf(stderr, "Failed to get tree for expression operand:\n");
1431 ir->operands[operand]->fprint(stderr);
1432 exit(1);
1433 }
1434 op[operand] = this->result;
1435
1436 /* Matrix expression operands should have been broken down to vector
1437 * operations already.
1438 */
1439 assert(!ir->operands[operand]->type->is_matrix());
1440 }
1441
1442 /* If nothing special happens, this is the result. */
1443 this->result = result_src;
1444
1445 switch (ir->operation) {
1446 case ir_unop_logic_not:
1447 emit(NOT(result_dst, op[0]));
1448 break;
1449 case ir_unop_neg:
1450 op[0].negate = !op[0].negate;
1451 emit(MOV(result_dst, op[0]));
1452 break;
1453 case ir_unop_abs:
1454 op[0].abs = true;
1455 op[0].negate = false;
1456 emit(MOV(result_dst, op[0]));
1457 break;
1458
1459 case ir_unop_sign:
1460 if (ir->type->is_float()) {
1461 /* AND(val, 0x80000000) gives the sign bit.
1462 *
1463 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1464 * zero.
1465 */
1466 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1467
1468 op[0].type = BRW_REGISTER_TYPE_UD;
1469 result_dst.type = BRW_REGISTER_TYPE_UD;
1470 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1471
1472 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1473 inst->predicate = BRW_PREDICATE_NORMAL;
1474
1475 this->result.type = BRW_REGISTER_TYPE_F;
1476 } else {
1477 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1478 * -> non-negative val generates 0x00000000.
1479 * Predicated OR sets 1 if val is positive.
1480 */
1481 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1482
1483 emit(ASR(result_dst, op[0], src_reg(31)));
1484
1485 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1486 inst->predicate = BRW_PREDICATE_NORMAL;
1487 }
1488 break;
1489
1490 case ir_unop_rcp:
1491 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1492 break;
1493
1494 case ir_unop_exp2:
1495 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1496 break;
1497 case ir_unop_log2:
1498 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1499 break;
1500 case ir_unop_exp:
1501 case ir_unop_log:
1502 unreachable("not reached: should be handled by ir_explog_to_explog2");
1503 case ir_unop_sin:
1504 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1505 break;
1506 case ir_unop_cos:
1507 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1508 break;
1509
1510 case ir_unop_dFdx:
1511 case ir_unop_dFdx_coarse:
1512 case ir_unop_dFdx_fine:
1513 case ir_unop_dFdy:
1514 case ir_unop_dFdy_coarse:
1515 case ir_unop_dFdy_fine:
1516 unreachable("derivatives not valid in vertex shader");
1517
1518 case ir_unop_bitfield_reverse:
1519 emit(BFREV(result_dst, op[0]));
1520 break;
1521 case ir_unop_bit_count:
1522 emit(CBIT(result_dst, op[0]));
1523 break;
1524 case ir_unop_find_msb: {
1525 src_reg temp = src_reg(this, glsl_type::uint_type);
1526
1527 inst = emit(FBH(dst_reg(temp), op[0]));
1528 inst->dst.writemask = WRITEMASK_XYZW;
1529
1530 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1531 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1532 * subtract the result from 31 to convert the MSB count into an LSB count.
1533 */
1534
1535 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1536 temp.swizzle = BRW_SWIZZLE_NOOP;
1537 emit(MOV(result_dst, temp));
1538
1539 src_reg src_tmp = src_reg(result_dst);
1540 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1541
1542 src_tmp.negate = true;
1543 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1544 inst->predicate = BRW_PREDICATE_NORMAL;
1545 break;
1546 }
1547 case ir_unop_find_lsb:
1548 emit(FBL(result_dst, op[0]));
1549 break;
1550 case ir_unop_saturate:
1551 inst = emit(MOV(result_dst, op[0]));
1552 inst->saturate = true;
1553 break;
1554
1555 case ir_unop_noise:
1556 unreachable("not reached: should be handled by lower_noise");
1557
1558 case ir_binop_add:
1559 emit(ADD(result_dst, op[0], op[1]));
1560 break;
1561 case ir_binop_sub:
1562 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1563
1564 case ir_binop_mul:
1565 if (devinfo->gen < 8 && ir->type->is_integer()) {
1566 /* For integer multiplication, the MUL uses the low 16 bits of one of
1567 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1568 * accumulates in the contribution of the upper 16 bits of that
1569 * operand. If we can determine that one of the args is in the low
1570 * 16 bits, though, we can just emit a single MUL.
1571 */
1572 if (ir->operands[0]->is_uint16_constant()) {
1573 if (devinfo->gen < 7)
1574 emit(MUL(result_dst, op[0], op[1]));
1575 else
1576 emit(MUL(result_dst, op[1], op[0]));
1577 } else if (ir->operands[1]->is_uint16_constant()) {
1578 if (devinfo->gen < 7)
1579 emit(MUL(result_dst, op[1], op[0]));
1580 else
1581 emit(MUL(result_dst, op[0], op[1]));
1582 } else {
1583 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1584
1585 emit(MUL(acc, op[0], op[1]));
1586 emit(MACH(dst_null_d(), op[0], op[1]));
1587 emit(MOV(result_dst, src_reg(acc)));
1588 }
1589 } else {
1590 emit(MUL(result_dst, op[0], op[1]));
1591 }
1592 break;
1593 case ir_binop_imul_high: {
1594 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1595
1596 emit(MUL(acc, op[0], op[1]));
1597 emit(MACH(result_dst, op[0], op[1]));
1598 break;
1599 }
1600 case ir_binop_div:
1601 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1602 assert(ir->type->is_integer());
1603 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1604 break;
1605 case ir_binop_carry: {
1606 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1607
1608 emit(ADDC(dst_null_ud(), op[0], op[1]));
1609 emit(MOV(result_dst, src_reg(acc)));
1610 break;
1611 }
1612 case ir_binop_borrow: {
1613 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1614
1615 emit(SUBB(dst_null_ud(), op[0], op[1]));
1616 emit(MOV(result_dst, src_reg(acc)));
1617 break;
1618 }
1619 case ir_binop_mod:
1620 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1621 assert(ir->type->is_integer());
1622 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1623 break;
1624
1625 case ir_binop_less:
1626 case ir_binop_greater:
1627 case ir_binop_lequal:
1628 case ir_binop_gequal:
1629 case ir_binop_equal:
1630 case ir_binop_nequal: {
1631 if (devinfo->gen <= 5) {
1632 resolve_bool_comparison(ir->operands[0], &op[0]);
1633 resolve_bool_comparison(ir->operands[1], &op[1]);
1634 }
1635 emit(CMP(result_dst, op[0], op[1],
1636 brw_conditional_for_comparison(ir->operation)));
1637 break;
1638 }
1639
1640 case ir_binop_all_equal:
1641 if (devinfo->gen <= 5) {
1642 resolve_bool_comparison(ir->operands[0], &op[0]);
1643 resolve_bool_comparison(ir->operands[1], &op[1]);
1644 }
1645
1646 /* "==" operator producing a scalar boolean. */
1647 if (ir->operands[0]->type->is_vector() ||
1648 ir->operands[1]->type->is_vector()) {
1649 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1650 emit(MOV(result_dst, src_reg(0)));
1651 inst = emit(MOV(result_dst, src_reg(~0)));
1652 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1653 } else {
1654 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1655 }
1656 break;
1657 case ir_binop_any_nequal:
1658 if (devinfo->gen <= 5) {
1659 resolve_bool_comparison(ir->operands[0], &op[0]);
1660 resolve_bool_comparison(ir->operands[1], &op[1]);
1661 }
1662
1663 /* "!=" operator producing a scalar boolean. */
1664 if (ir->operands[0]->type->is_vector() ||
1665 ir->operands[1]->type->is_vector()) {
1666 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1667
1668 emit(MOV(result_dst, src_reg(0)));
1669 inst = emit(MOV(result_dst, src_reg(~0)));
1670 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1671 } else {
1672 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1673 }
1674 break;
1675
1676 case ir_unop_any:
1677 if (devinfo->gen <= 5) {
1678 resolve_bool_comparison(ir->operands[0], &op[0]);
1679 }
1680 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1681 emit(MOV(result_dst, src_reg(0)));
1682
1683 inst = emit(MOV(result_dst, src_reg(~0)));
1684 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1685 break;
1686
1687 case ir_binop_logic_xor:
1688 emit(XOR(result_dst, op[0], op[1]));
1689 break;
1690
1691 case ir_binop_logic_or:
1692 emit(OR(result_dst, op[0], op[1]));
1693 break;
1694
1695 case ir_binop_logic_and:
1696 emit(AND(result_dst, op[0], op[1]));
1697 break;
1698
1699 case ir_binop_dot:
1700 assert(ir->operands[0]->type->is_vector());
1701 assert(ir->operands[0]->type == ir->operands[1]->type);
1702 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1703 break;
1704
1705 case ir_unop_sqrt:
1706 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1707 break;
1708 case ir_unop_rsq:
1709 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1710 break;
1711
1712 case ir_unop_bitcast_i2f:
1713 case ir_unop_bitcast_u2f:
1714 this->result = op[0];
1715 this->result.type = BRW_REGISTER_TYPE_F;
1716 break;
1717
1718 case ir_unop_bitcast_f2i:
1719 this->result = op[0];
1720 this->result.type = BRW_REGISTER_TYPE_D;
1721 break;
1722
1723 case ir_unop_bitcast_f2u:
1724 this->result = op[0];
1725 this->result.type = BRW_REGISTER_TYPE_UD;
1726 break;
1727
1728 case ir_unop_i2f:
1729 case ir_unop_i2u:
1730 case ir_unop_u2i:
1731 case ir_unop_u2f:
1732 case ir_unop_f2i:
1733 case ir_unop_f2u:
1734 emit(MOV(result_dst, op[0]));
1735 break;
1736 case ir_unop_b2i:
1737 emit(AND(result_dst, op[0], src_reg(1)));
1738 break;
1739 case ir_unop_b2f:
1740 if (devinfo->gen <= 5) {
1741 resolve_bool_comparison(ir->operands[0], &op[0]);
1742 }
1743 op[0].type = BRW_REGISTER_TYPE_D;
1744 result_dst.type = BRW_REGISTER_TYPE_D;
1745 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1746 result_dst.type = BRW_REGISTER_TYPE_F;
1747 break;
1748 case ir_unop_f2b:
1749 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1750 break;
1751 case ir_unop_i2b:
1752 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1753 break;
1754
1755 case ir_unop_trunc:
1756 emit(RNDZ(result_dst, op[0]));
1757 break;
1758 case ir_unop_ceil: {
1759 src_reg tmp = src_reg(this, ir->type);
1760 op[0].negate = !op[0].negate;
1761 emit(RNDD(dst_reg(tmp), op[0]));
1762 tmp.negate = true;
1763 emit(MOV(result_dst, tmp));
1764 }
1765 break;
1766 case ir_unop_floor:
1767 inst = emit(RNDD(result_dst, op[0]));
1768 break;
1769 case ir_unop_fract:
1770 inst = emit(FRC(result_dst, op[0]));
1771 break;
1772 case ir_unop_round_even:
1773 emit(RNDE(result_dst, op[0]));
1774 break;
1775
1776 case ir_binop_min:
1777 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1778 break;
1779 case ir_binop_max:
1780 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1781 break;
1782
1783 case ir_binop_pow:
1784 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1785 break;
1786
1787 case ir_unop_bit_not:
1788 inst = emit(NOT(result_dst, op[0]));
1789 break;
1790 case ir_binop_bit_and:
1791 inst = emit(AND(result_dst, op[0], op[1]));
1792 break;
1793 case ir_binop_bit_xor:
1794 inst = emit(XOR(result_dst, op[0], op[1]));
1795 break;
1796 case ir_binop_bit_or:
1797 inst = emit(OR(result_dst, op[0], op[1]));
1798 break;
1799
1800 case ir_binop_lshift:
1801 inst = emit(SHL(result_dst, op[0], op[1]));
1802 break;
1803
1804 case ir_binop_rshift:
1805 if (ir->type->base_type == GLSL_TYPE_INT)
1806 inst = emit(ASR(result_dst, op[0], op[1]));
1807 else
1808 inst = emit(SHR(result_dst, op[0], op[1]));
1809 break;
1810
1811 case ir_binop_bfm:
1812 emit(BFI1(result_dst, op[0], op[1]));
1813 break;
1814
1815 case ir_binop_ubo_load: {
1816 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1817 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1818 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1819 src_reg offset;
1820
1821 /* Now, load the vector from that offset. */
1822 assert(ir->type->is_vector() || ir->type->is_scalar());
1823
1824 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1825 packed_consts.type = result.type;
1826 src_reg surf_index;
1827
1828 if (const_uniform_block) {
1829 /* The block index is a constant, so just emit the binding table entry
1830 * as an immediate.
1831 */
1832 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1833 const_uniform_block->value.u[0]);
1834 } else {
1835 /* The block index is not a constant. Evaluate the index expression
1836 * per-channel and add the base UBO index; we have to select a value
1837 * from any live channel.
1838 */
1839 surf_index = src_reg(this, glsl_type::uint_type);
1840 emit(ADD(dst_reg(surf_index), op[0],
1841 src_reg(prog_data->base.binding_table.ubo_start)));
1842 emit_uniformize(dst_reg(surf_index), surf_index);
1843
1844 /* Assume this may touch any UBO. It would be nice to provide
1845 * a tighter bound, but the array information is already lowered away.
1846 */
1847 brw_mark_surface_used(&prog_data->base,
1848 prog_data->base.binding_table.ubo_start +
1849 shader_prog->NumUniformBlocks - 1);
1850 }
1851
1852 if (const_offset_ir) {
1853 if (devinfo->gen >= 8) {
1854 /* Store the offset in a GRF so we can send-from-GRF. */
1855 offset = src_reg(this, glsl_type::int_type);
1856 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1857 } else {
1858 /* Immediates are fine on older generations since they'll be moved
1859 * to a (potentially fake) MRF at the generator level.
1860 */
1861 offset = src_reg(const_offset / 16);
1862 }
1863 } else {
1864 offset = src_reg(this, glsl_type::uint_type);
1865 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1866 }
1867
1868 emit_pull_constant_load_reg(dst_reg(packed_consts),
1869 surf_index,
1870 offset,
1871 NULL, NULL /* before_block/inst */);
1872
1873 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1874 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1875 const_offset % 16 / 4,
1876 const_offset % 16 / 4,
1877 const_offset % 16 / 4);
1878
1879 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1880 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1881 emit(CMP(result_dst, packed_consts, src_reg(0u),
1882 BRW_CONDITIONAL_NZ));
1883 } else {
1884 emit(MOV(result_dst, packed_consts));
1885 }
1886 break;
1887 }
1888
1889 case ir_binop_vector_extract:
1890 unreachable("should have been lowered by vec_index_to_cond_assign");
1891
1892 case ir_triop_fma:
1893 op[0] = fix_3src_operand(op[0]);
1894 op[1] = fix_3src_operand(op[1]);
1895 op[2] = fix_3src_operand(op[2]);
1896 /* Note that the instruction's argument order is reversed from GLSL
1897 * and the IR.
1898 */
1899 emit(MAD(result_dst, op[2], op[1], op[0]));
1900 break;
1901
1902 case ir_triop_lrp:
1903 emit_lrp(result_dst, op[0], op[1], op[2]);
1904 break;
1905
1906 case ir_triop_csel:
1907 unreachable("already handled above");
1908 break;
1909
1910 case ir_triop_bfi:
1911 op[0] = fix_3src_operand(op[0]);
1912 op[1] = fix_3src_operand(op[1]);
1913 op[2] = fix_3src_operand(op[2]);
1914 emit(BFI2(result_dst, op[0], op[1], op[2]));
1915 break;
1916
1917 case ir_triop_bitfield_extract:
1918 op[0] = fix_3src_operand(op[0]);
1919 op[1] = fix_3src_operand(op[1]);
1920 op[2] = fix_3src_operand(op[2]);
1921 /* Note that the instruction's argument order is reversed from GLSL
1922 * and the IR.
1923 */
1924 emit(BFE(result_dst, op[2], op[1], op[0]));
1925 break;
1926
1927 case ir_triop_vector_insert:
1928 unreachable("should have been lowered by lower_vector_insert");
1929
1930 case ir_quadop_bitfield_insert:
1931 unreachable("not reached: should be handled by "
1932 "bitfield_insert_to_bfm_bfi\n");
1933
1934 case ir_quadop_vector:
1935 unreachable("not reached: should be handled by lower_quadop_vector");
1936
1937 case ir_unop_pack_half_2x16:
1938 emit_pack_half_2x16(result_dst, op[0]);
1939 break;
1940 case ir_unop_unpack_half_2x16:
1941 emit_unpack_half_2x16(result_dst, op[0]);
1942 break;
1943 case ir_unop_unpack_unorm_4x8:
1944 emit_unpack_unorm_4x8(result_dst, op[0]);
1945 break;
1946 case ir_unop_unpack_snorm_4x8:
1947 emit_unpack_snorm_4x8(result_dst, op[0]);
1948 break;
1949 case ir_unop_pack_unorm_4x8:
1950 emit_pack_unorm_4x8(result_dst, op[0]);
1951 break;
1952 case ir_unop_pack_snorm_4x8:
1953 emit_pack_snorm_4x8(result_dst, op[0]);
1954 break;
1955 case ir_unop_pack_snorm_2x16:
1956 case ir_unop_pack_unorm_2x16:
1957 case ir_unop_unpack_snorm_2x16:
1958 case ir_unop_unpack_unorm_2x16:
1959 unreachable("not reached: should be handled by lower_packing_builtins");
1960 case ir_unop_unpack_half_2x16_split_x:
1961 case ir_unop_unpack_half_2x16_split_y:
1962 case ir_binop_pack_half_2x16_split:
1963 case ir_unop_interpolate_at_centroid:
1964 case ir_binop_interpolate_at_sample:
1965 case ir_binop_interpolate_at_offset:
1966 unreachable("not reached: should not occur in vertex shader");
1967 case ir_binop_ldexp:
1968 unreachable("not reached: should be handled by ldexp_to_arith()");
1969 case ir_unop_d2f:
1970 case ir_unop_f2d:
1971 case ir_unop_d2i:
1972 case ir_unop_i2d:
1973 case ir_unop_d2u:
1974 case ir_unop_u2d:
1975 case ir_unop_d2b:
1976 case ir_unop_pack_double_2x32:
1977 case ir_unop_unpack_double_2x32:
1978 case ir_unop_frexp_sig:
1979 case ir_unop_frexp_exp:
1980 unreachable("fp64 todo");
1981 }
1982 }
1983
1984
1985 void
1986 vec4_visitor::visit(ir_swizzle *ir)
1987 {
1988 /* Note that this is only swizzles in expressions, not those on the left
1989 * hand side of an assignment, which do write masking. See ir_assignment
1990 * for that.
1991 */
1992 const unsigned swz = brw_compose_swizzle(
1993 brw_swizzle_for_size(ir->type->vector_elements),
1994 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1995
1996 ir->val->accept(this);
1997 this->result = swizzle(this->result, swz);
1998 }
1999
2000 void
2001 vec4_visitor::visit(ir_dereference_variable *ir)
2002 {
2003 const struct glsl_type *type = ir->type;
2004 dst_reg *reg = variable_storage(ir->var);
2005
2006 if (!reg) {
2007 fail("Failed to find variable storage for %s\n", ir->var->name);
2008 this->result = src_reg(brw_null_reg());
2009 return;
2010 }
2011
2012 this->result = src_reg(*reg);
2013
2014 /* System values get their swizzle from the dst_reg writemask */
2015 if (ir->var->data.mode == ir_var_system_value)
2016 return;
2017
2018 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2019 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2020 }
2021
2022
2023 int
2024 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2025 {
2026 /* Under normal circumstances array elements are stored consecutively, so
2027 * the stride is equal to the size of the array element.
2028 */
2029 return type_size(ir->type);
2030 }
2031
2032
2033 void
2034 vec4_visitor::visit(ir_dereference_array *ir)
2035 {
2036 ir_constant *constant_index;
2037 src_reg src;
2038 int array_stride = compute_array_stride(ir);
2039
2040 constant_index = ir->array_index->constant_expression_value();
2041
2042 ir->array->accept(this);
2043 src = this->result;
2044
2045 if (constant_index) {
2046 src.reg_offset += constant_index->value.i[0] * array_stride;
2047 } else {
2048 /* Variable index array dereference. It eats the "vec4" of the
2049 * base of the array and an index that offsets the Mesa register
2050 * index.
2051 */
2052 ir->array_index->accept(this);
2053
2054 src_reg index_reg;
2055
2056 if (array_stride == 1) {
2057 index_reg = this->result;
2058 } else {
2059 index_reg = src_reg(this, glsl_type::int_type);
2060
2061 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2062 }
2063
2064 if (src.reladdr) {
2065 src_reg temp = src_reg(this, glsl_type::int_type);
2066
2067 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2068
2069 index_reg = temp;
2070 }
2071
2072 src.reladdr = ralloc(mem_ctx, src_reg);
2073 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2074 }
2075
2076 /* If the type is smaller than a vec4, replicate the last channel out. */
2077 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2078 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2079 else
2080 src.swizzle = BRW_SWIZZLE_NOOP;
2081 src.type = brw_type_for_base_type(ir->type);
2082
2083 this->result = src;
2084 }
2085
2086 void
2087 vec4_visitor::visit(ir_dereference_record *ir)
2088 {
2089 unsigned int i;
2090 const glsl_type *struct_type = ir->record->type;
2091 int offset = 0;
2092
2093 ir->record->accept(this);
2094
2095 for (i = 0; i < struct_type->length; i++) {
2096 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2097 break;
2098 offset += type_size(struct_type->fields.structure[i].type);
2099 }
2100
2101 /* If the type is smaller than a vec4, replicate the last channel out. */
2102 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2103 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2104 else
2105 this->result.swizzle = BRW_SWIZZLE_NOOP;
2106 this->result.type = brw_type_for_base_type(ir->type);
2107
2108 this->result.reg_offset += offset;
2109 }
2110
2111 /**
2112 * We want to be careful in assignment setup to hit the actual storage
2113 * instead of potentially using a temporary like we might with the
2114 * ir_dereference handler.
2115 */
2116 static dst_reg
2117 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2118 {
2119 /* The LHS must be a dereference. If the LHS is a variable indexed array
2120 * access of a vector, it must be separated into a series of conditional moves
2121 * before reaching this point (see ir_vec_index_to_cond_assign).
2122 */
2123 assert(ir->as_dereference());
2124 ir_dereference_array *deref_array = ir->as_dereference_array();
2125 if (deref_array) {
2126 assert(!deref_array->array->type->is_vector());
2127 }
2128
2129 /* Use the rvalue deref handler for the most part. We'll ignore
2130 * any swizzles in it, though, and express write swizzles using the writemask.
2131 */
2132 ir->accept(v);
2133 return dst_reg(v->result);
2134 }
2135
2136 void
2137 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2138 const struct glsl_type *type,
2139 enum brw_predicate predicate)
2140 {
2141 if (type->base_type == GLSL_TYPE_STRUCT) {
2142 for (unsigned int i = 0; i < type->length; i++) {
2143 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2144 }
2145 return;
2146 }
2147
2148 if (type->is_array()) {
2149 for (unsigned int i = 0; i < type->length; i++) {
2150 emit_block_move(dst, src, type->fields.array, predicate);
2151 }
2152 return;
2153 }
2154
2155 if (type->is_matrix()) {
2156 const struct glsl_type *vec_type;
2157
2158 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2159 type->vector_elements, 1);
2160
2161 for (int i = 0; i < type->matrix_columns; i++) {
2162 emit_block_move(dst, src, vec_type, predicate);
2163 }
2164 return;
2165 }
2166
2167 assert(type->is_scalar() || type->is_vector());
2168
2169 dst->type = brw_type_for_base_type(type);
2170 src->type = dst->type;
2171
2172 dst->writemask = (1 << type->vector_elements) - 1;
2173
2174 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2175
2176 vec4_instruction *inst = emit(MOV(*dst, *src));
2177 inst->predicate = predicate;
2178
2179 dst->reg_offset++;
2180 src->reg_offset++;
2181 }
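/* Illustrative note (not part of the upstream code): for a hypothetical mat2
 * source, emit_block_move() recurses into two vec2 column moves; each
 * iteration emits a MOV with writemask .xy and source swizzle .xyyy
 * (predicated if the assignment had a condition), then bumps reg_offset on
 * both dst and src so the next column lands in the next vec4.
 */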
2182
2183
2184 /* If the RHS processing resulted in an instruction generating a
2185 * temporary value, and it would be easy to rewrite the instruction to
2186 * generate its result right into the LHS instead, do so. This ends
2187 * up reliably removing instructions where it can be tricky to do so
2188 * later without real use-def (UD) chain information.
2189 */
2190 bool
2191 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2192 dst_reg dst,
2193 src_reg src,
2194 vec4_instruction *pre_rhs_inst,
2195 vec4_instruction *last_rhs_inst)
2196 {
2197 /* This could be supported, but it would take more smarts. */
2198 if (ir->condition)
2199 return false;
2200
2201 if (pre_rhs_inst == last_rhs_inst)
2202 return false; /* No instructions generated to work with. */
2203
2204 /* Make sure the last instruction generated our source reg. */
2205 if (src.file != GRF ||
2206 src.file != last_rhs_inst->dst.file ||
2207 src.reg != last_rhs_inst->dst.reg ||
2208 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2209 src.reladdr ||
2210 src.abs ||
2211 src.negate ||
2212 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2213 return false;
2214
2215 /* Check that the last instruction fully initialized the channels
2216 * we want to use, in the order we want to use them. We could
2217 * potentially reswizzle the operands of many instructions so that
2218 * we could handle out of order channels, but don't yet.
2219 */
2220
2221 for (unsigned i = 0; i < 4; i++) {
2222 if (dst.writemask & (1 << i)) {
2223 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2224 return false;
2225
2226 if (BRW_GET_SWZ(src.swizzle, i) != i)
2227 return false;
2228 }
2229 }
2230
2231 /* Success! Rewrite the instruction. */
2232 last_rhs_inst->dst.file = dst.file;
2233 last_rhs_inst->dst.reg = dst.reg;
2234 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2235 last_rhs_inst->dst.reladdr = dst.reladdr;
2236 last_rhs_inst->dst.writemask &= dst.writemask;
2237
2238 return true;
2239 }
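/* Illustrative example (not part of the upstream code): for an assignment
 * like "a = b + c", the RHS visitor emits "ADD tmp, b, c" and the assignment
 * would normally add "MOV a, tmp".  When the checks above pass, the ADD is
 * rewritten to write straight into a's storage and the copy is never emitted.
 */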
2240
2241 void
2242 vec4_visitor::visit(ir_assignment *ir)
2243 {
2244 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2245 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2246
2247 if (!ir->lhs->type->is_scalar() &&
2248 !ir->lhs->type->is_vector()) {
2249 ir->rhs->accept(this);
2250 src_reg src = this->result;
2251
2252 if (ir->condition) {
2253 emit_bool_to_cond_code(ir->condition, &predicate);
2254 }
2255
2256 /* emit_block_move doesn't account for swizzles in the source register.
2257 * This should be ok, since the source register is a structure or an
2258 * array, and those can't be swizzled. But double-check to be sure.
2259 */
2260 assert(src.swizzle ==
2261 (ir->rhs->type->is_matrix()
2262 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2263 : BRW_SWIZZLE_NOOP));
2264
2265 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2266 return;
2267 }
2268
2269 /* Now we're down to just a scalar/vector with writemasks. */
2270 int i;
2271
2272 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2273 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2274
2275 ir->rhs->accept(this);
2276
2277 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2278
2279 int swizzles[4];
2280 int src_chan = 0;
2281
2282 assert(ir->lhs->type->is_vector() ||
2283 ir->lhs->type->is_scalar());
2284 dst.writemask = ir->write_mask;
2285
2286 /* Swizzle a small RHS vector into the channels being written.
2287 *
2288 * GLSL IR treats write_mask as dictating how many channels are
2289 * present on the RHS, while in our instructions we need to make
2290 * those channels appear in the slots of the vec4 they're written to.
2291 */
2292 for (int i = 0; i < 4; i++)
2293 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2294
2295 src_reg src = swizzle(this->result,
2296 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2297 swizzles[2], swizzles[3]));
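/* Illustrative example (not part of the upstream code): for a hypothetical
 * write such as "v.yw = u.xy", ir->write_mask is 0xa (.yw), so the loop above
 * builds swizzles[] = { 0, 0, 0, 1 } and src ends up swizzled as .xxxy: RHS
 * channel x lands in dst.y and RHS channel y lands in dst.w.
 */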
2298
2299 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2300 return;
2301 }
2302
2303 if (ir->condition) {
2304 emit_bool_to_cond_code(ir->condition, &predicate);
2305 }
2306
2307 for (i = 0; i < type_size(ir->lhs->type); i++) {
2308 vec4_instruction *inst = emit(MOV(dst, src));
2309 inst->predicate = predicate;
2310
2311 dst.reg_offset++;
2312 src.reg_offset++;
2313 }
2314 }
2315
2316 void
2317 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2318 {
2319 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2320 foreach_in_list(ir_constant, field_value, &ir->components) {
2321 emit_constant_values(dst, field_value);
2322 }
2323 return;
2324 }
2325
2326 if (ir->type->is_array()) {
2327 for (unsigned int i = 0; i < ir->type->length; i++) {
2328 emit_constant_values(dst, ir->array_elements[i]);
2329 }
2330 return;
2331 }
2332
2333 if (ir->type->is_matrix()) {
2334 for (int i = 0; i < ir->type->matrix_columns; i++) {
2335 float *vec = &ir->value.f[i * ir->type->vector_elements];
2336
2337 for (int j = 0; j < ir->type->vector_elements; j++) {
2338 dst->writemask = 1 << j;
2339 dst->type = BRW_REGISTER_TYPE_F;
2340
2341 emit(MOV(*dst, src_reg(vec[j])));
2342 }
2343 dst->reg_offset++;
2344 }
2345 return;
2346 }
2347
2348 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2349
2350 for (int i = 0; i < ir->type->vector_elements; i++) {
2351 if (!(remaining_writemask & (1 << i)))
2352 continue;
2353
2354 dst->writemask = 1 << i;
2355 dst->type = brw_type_for_base_type(ir->type);
2356
2357 /* Find other components that match the one we're about to
2358 * write. Emits fewer instructions for things like vec4(0.5,
2359 * 1.5, 1.5, 1.5).
2360 */
2361 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2362 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2363 if (ir->value.b[i] == ir->value.b[j])
2364 dst->writemask |= (1 << j);
2365 } else {
2366 /* u, i, and f storage all line up, so no need for a
2367 * switch case for comparing each type.
2368 */
2369 if (ir->value.u[i] == ir->value.u[j])
2370 dst->writemask |= (1 << j);
2371 }
2372 }
2373
2374 switch (ir->type->base_type) {
2375 case GLSL_TYPE_FLOAT:
2376 emit(MOV(*dst, src_reg(ir->value.f[i])));
2377 break;
2378 case GLSL_TYPE_INT:
2379 emit(MOV(*dst, src_reg(ir->value.i[i])));
2380 break;
2381 case GLSL_TYPE_UINT:
2382 emit(MOV(*dst, src_reg(ir->value.u[i])));
2383 break;
2384 case GLSL_TYPE_BOOL:
2385 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2386 break;
2387 default:
2388 unreachable("Non-float/uint/int/bool constant");
2389 }
2390
2391 remaining_writemask &= ~dst->writemask;
2392 }
2393 dst->reg_offset++;
2394 }
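/* Illustrative note (not part of the upstream code): for the vec4(0.5, 1.5,
 * 1.5, 1.5) case mentioned above, the first pass writes 0.5 with writemask .x,
 * and the second pass notices that components y, z and w all match, so a
 * single MOV with writemask .yzw covers the rest.
 */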
2395
2396 void
2397 vec4_visitor::visit(ir_constant *ir)
2398 {
2399 dst_reg dst = dst_reg(this, ir->type);
2400 this->result = src_reg(dst);
2401
2402 emit_constant_values(&dst, ir);
2403 }
2404
2405 void
2406 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2407 {
2408 ir_dereference *deref = static_cast<ir_dereference *>(
2409 ir->actual_parameters.get_head());
2410 ir_variable *location = deref->variable_referenced();
2411 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2412 location->data.binding);
2413
2414 /* Calculate the surface offset */
2415 src_reg offset(this, glsl_type::uint_type);
2416 ir_dereference_array *deref_array = deref->as_dereference_array();
2417 if (deref_array) {
2418 deref_array->array_index->accept(this);
2419
2420 src_reg tmp(this, glsl_type::uint_type);
2421 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2422 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2423 } else {
2424 offset = location->data.atomic.offset;
2425 }
2426
2427 /* Emit the appropriate machine instruction */
2428 const char *callee = ir->callee->function_name();
2429 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2430
2431 if (!strcmp("__intrinsic_atomic_read", callee)) {
2432 emit_untyped_surface_read(surf_index, dst, offset);
2433
2434 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2435 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2436 src_reg(), src_reg());
2437
2438 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2439 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2440 src_reg(), src_reg());
2441 }
2442 }
2443
2444 void
2445 vec4_visitor::visit(ir_call *ir)
2446 {
2447 const char *callee = ir->callee->function_name();
2448
2449 if (!strcmp("__intrinsic_atomic_read", callee) ||
2450 !strcmp("__intrinsic_atomic_increment", callee) ||
2451 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2452 visit_atomic_counter_intrinsic(ir);
2453 } else {
2454 unreachable("Unsupported intrinsic.");
2455 }
2456 }
2457
2458 src_reg
2459 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2460 {
2461 vec4_instruction *inst =
2462 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2463 dst_reg(this, glsl_type::uvec4_type));
2464 inst->base_mrf = 2;
2465 inst->src[1] = sampler;
2466
2467 int param_base;
2468
2469 if (devinfo->gen >= 9) {
2470 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2471 vec4_instruction *header_inst = new(mem_ctx)
2472 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2473 dst_reg(MRF, inst->base_mrf));
2474
2475 emit(header_inst);
2476
2477 inst->mlen = 2;
2478 inst->header_size = 1;
2479 param_base = inst->base_mrf + 1;
2480 } else {
2481 inst->mlen = 1;
2482 param_base = inst->base_mrf;
2483 }
2484
2485 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2486 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2487 int zero_mask = 0xf & ~coord_mask;
2488
2489 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2490 coordinate));
2491
2492 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2493 src_reg(0)));
2494
2495 emit(inst);
2496 return src_reg(inst->dst);
2497 }
2498
2499 static bool
2500 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2501 {
2502 if (devinfo->gen < 8 && !devinfo->is_haswell)
2503 return false;
2504
2505 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2506 }
2507
2508 void
2509 vec4_visitor::visit(ir_texture *ir)
2510 {
2511 uint32_t sampler =
2512 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2513
2514 ir_rvalue *nonconst_sampler_index =
2515 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2516
2517 /* Handle non-constant sampler array indexing */
2518 src_reg sampler_reg;
2519 if (nonconst_sampler_index) {
2520 /* The highest sampler which may be used by this operation is
2521 * the last element of the array. Mark it here, because the generator
2522 * doesn't have enough information to determine the bound.
2523 */
2524 uint32_t array_size = ir->sampler->as_dereference_array()
2525 ->array->type->array_size();
2526
2527 uint32_t max_used = sampler + array_size - 1;
2528 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2529 max_used += prog_data->base.binding_table.gather_texture_start;
2530 } else {
2531 max_used += prog_data->base.binding_table.texture_start;
2532 }
2533
2534 brw_mark_surface_used(&prog_data->base, max_used);
2535
2536 /* Emit code to evaluate the actual indexing expression */
2537 nonconst_sampler_index->accept(this);
2538 dst_reg temp(this, glsl_type::uint_type);
2539 emit(ADD(temp, this->result, src_reg(sampler)));
2540 emit_uniformize(temp, src_reg(temp));
2541
2542 sampler_reg = src_reg(temp);
2543 } else {
2544 /* Single sampler, or constant array index; the indexing expression
2545 * is just an immediate.
2546 */
2547 sampler_reg = src_reg(sampler);
2548 }
2549
2550 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2551 * emitting anything other than setting up the constant result.
2552 */
2553 if (ir->op == ir_tg4) {
2554 ir_constant *chan = ir->lod_info.component->as_constant();
2555 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2556 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2557 dst_reg result(this, ir->type);
2558 this->result = src_reg(result);
2559 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2560 return;
2561 }
2562 }
2563
2564 /* Should be lowered by do_lower_texture_projection */
2565 assert(!ir->projector);
2566
2567 /* Should be lowered */
2568 assert(!ir->offset || !ir->offset->type->is_array());
2569
2570 /* Generate code to compute all the subexpression trees. This has to be
2571 * done before loading any values into MRFs for the sampler message since
2572 * generating these values may involve SEND messages that need the MRFs.
2573 */
2574 src_reg coordinate;
2575 if (ir->coordinate) {
2576 ir->coordinate->accept(this);
2577 coordinate = this->result;
2578 }
2579
2580 src_reg shadow_comparitor;
2581 if (ir->shadow_comparitor) {
2582 ir->shadow_comparitor->accept(this);
2583 shadow_comparitor = this->result;
2584 }
2585
2586 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2587 src_reg offset_value;
2588 if (has_nonconstant_offset) {
2589 ir->offset->accept(this);
2590 offset_value = src_reg(this->result);
2591 }
2592
2593 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2594 src_reg lod, dPdx, dPdy, sample_index, mcs;
2595 switch (ir->op) {
2596 case ir_tex:
2597 lod = src_reg(0.0f);
2598 lod_type = glsl_type::float_type;
2599 break;
2600 case ir_txf:
2601 case ir_txl:
2602 case ir_txs:
2603 ir->lod_info.lod->accept(this);
2604 lod = this->result;
2605 lod_type = ir->lod_info.lod->type;
2606 break;
2607 case ir_query_levels:
2608 lod = src_reg(0);
2609 lod_type = glsl_type::int_type;
2610 break;
2611 case ir_txf_ms:
2612 ir->lod_info.sample_index->accept(this);
2613 sample_index = this->result;
2614 sample_index_type = ir->lod_info.sample_index->type;
2615
2616 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2617 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2618 else
2619 mcs = src_reg(0u);
2620 break;
2621 case ir_txd:
2622 ir->lod_info.grad.dPdx->accept(this);
2623 dPdx = this->result;
2624
2625 ir->lod_info.grad.dPdy->accept(this);
2626 dPdy = this->result;
2627
2628 lod_type = ir->lod_info.grad.dPdx->type;
2629 break;
2630 case ir_txb:
2631 case ir_lod:
2632 case ir_tg4:
2633 break;
2634 }
2635
2636 enum opcode opcode;
2637 switch (ir->op) {
2638 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2639 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2640 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2641 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2642 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2643 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2644 case ir_tg4: opcode = has_nonconstant_offset
2645 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2646 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2647 case ir_txb:
2648 unreachable("TXB is not valid for vertex shaders.");
2649 case ir_lod:
2650 unreachable("LOD is not valid for vertex shaders.");
2651 default:
2652 unreachable("Unrecognized tex op");
2653 }
2654
2655 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2656 opcode, dst_reg(this, ir->type));
2657
2658 if (ir->offset != NULL && !has_nonconstant_offset) {
2659 inst->offset =
2660 brw_texture_offset(ir->offset->as_constant()->value.i,
2661 ir->offset->type->vector_elements);
2662 }
2663
2664 /* Stuff the channel select bits in the top of the texture offset */
2665 if (ir->op == ir_tg4)
2666 inst->offset |= gather_channel(ir, sampler) << 16;
2667
2668 /* The message header is necessary for:
2669 * - Gen4 (always)
2670 * - Gen9+ for selecting SIMD4x2
2671 * - Texel offsets
2672 * - Gather channel selection
2673 * - Sampler indices too large to fit in a 4-bit value.
2674 */
2675 inst->header_size =
2676 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2677 inst->offset != 0 || ir->op == ir_tg4 ||
2678 is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2679 inst->base_mrf = 2;
2680 inst->mlen = inst->header_size + 1; /* always at least one */
2681 inst->dst.writemask = WRITEMASK_XYZW;
2682 inst->shadow_compare = ir->shadow_comparitor != NULL;
2683
2684 inst->src[1] = sampler_reg;
2685
2686 /* MRF for the first parameter */
2687 int param_base = inst->base_mrf + inst->header_size;
2688
2689 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2690 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2691 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2692 } else {
2693 /* Load the coordinate */
2694 /* FINISHME: gl_clamp_mask and saturate */
2695 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2696 int zero_mask = 0xf & ~coord_mask;
2697
2698 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2699 coordinate));
2700
2701 if (zero_mask != 0) {
2702 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2703 src_reg(0)));
2704 }
2705 /* Load the shadow comparitor */
2706 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2707 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2708 WRITEMASK_X),
2709 shadow_comparitor));
2710 inst->mlen++;
2711 }
2712
2713 /* Load the LOD info */
2714 if (ir->op == ir_tex || ir->op == ir_txl) {
2715 int mrf, writemask;
2716 if (devinfo->gen >= 5) {
2717 mrf = param_base + 1;
2718 if (ir->shadow_comparitor) {
2719 writemask = WRITEMASK_Y;
2720 /* mlen already incremented */
2721 } else {
2722 writemask = WRITEMASK_X;
2723 inst->mlen++;
2724 }
2725 } else /* devinfo->gen == 4 */ {
2726 mrf = param_base;
2727 writemask = WRITEMASK_W;
2728 }
2729 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2730 } else if (ir->op == ir_txf) {
2731 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2732 } else if (ir->op == ir_txf_ms) {
2733 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2734 sample_index));
2735 if (devinfo->gen >= 7) {
2736 /* MCS data is in the first channel of `mcs`, but we need to get it into
2737 * the .y channel of the second vec4 of params, so replicate .x across
2738 * the whole vec4 and then mask off everything except .y
2739 */
2740 mcs.swizzle = BRW_SWIZZLE_XXXX;
2741 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2742 mcs));
2743 }
2744 inst->mlen++;
2745 } else if (ir->op == ir_txd) {
2746 const glsl_type *type = lod_type;
2747
2748 if (devinfo->gen >= 5) {
2749 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2750 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2751 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2752 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2753 inst->mlen++;
2754
2755 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2756 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2757 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2758 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2759 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2760 inst->mlen++;
2761
2762 if (ir->shadow_comparitor) {
2763 emit(MOV(dst_reg(MRF, param_base + 2,
2764 ir->shadow_comparitor->type, WRITEMASK_Z),
2765 shadow_comparitor));
2766 }
2767 }
2768 } else /* devinfo->gen == 4 */ {
2769 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2770 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2771 inst->mlen += 2;
2772 }
2773 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2774 if (ir->shadow_comparitor) {
2775 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2776 shadow_comparitor));
2777 }
2778
2779 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2780 offset_value));
2781 inst->mlen++;
2782 }
2783 }
2784
2785 emit(inst);
2786
2787 /* Fix up the number of layers (z) for cube arrays: the hardware returns
2788 * faces * layers, but the spec requires layers.
2789 */
2790 if (ir->op == ir_txs) {
2791 glsl_type const *type = ir->sampler->type;
2792 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2793 type->sampler_array) {
2794 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2795 writemask(inst->dst, WRITEMASK_Z),
2796 src_reg(inst->dst), src_reg(6));
2797 }
2798 }
2799
2800 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2801 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2802 }
2803
2804 swizzle_result(ir, src_reg(inst->dst), sampler);
2805 }
2806
2807 /**
2808 * Apply workarounds for Gen6 gather with UINT/SINT
2809 */
2810 void
2811 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2812 {
2813 if (!wa)
2814 return;
2815
2816 int width = (wa & WA_8BIT) ? 8 : 16;
2817 dst_reg dst_f = dst;
2818 dst_f.type = BRW_REGISTER_TYPE_F;
2819
2820 /* Convert from UNORM to UINT */
2821 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2822 emit(MOV(dst, src_reg(dst_f)));
2823
2824 if (wa & WA_SIGN) {
2825 /* Reinterpret the UINT value as a signed INT value by
2826 * shifting the sign bit into place, then shifting back
2827 * preserving sign.
2828 */
2829 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2830 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2831 }
2832 }
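/* Illustrative note (not part of the upstream code): with WA_8BIT | WA_SIGN,
 * the UNORM value is first scaled by 255 and converted to an integer; a raw
 * byte of 0xff then becomes 0xff000000 after the SHL by 24 and 0xffffffff
 * (i.e. -1) after the arithmetic ASR, recovering the signed interpretation.
 */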
2833
2834 /**
2835 * Set up the gather channel based on the swizzle, for gather4.
2836 */
2837 uint32_t
2838 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2839 {
2840 ir_constant *chan = ir->lod_info.component->as_constant();
2841 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2842 switch (swiz) {
2843 case SWIZZLE_X: return 0;
2844 case SWIZZLE_Y:
2845 /* gather4 sampler is broken for green channel on RG32F --
2846 * we must ask for blue instead.
2847 */
2848 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2849 return 2;
2850 return 1;
2851 case SWIZZLE_Z: return 2;
2852 case SWIZZLE_W: return 3;
2853 default:
2854 unreachable("Not reached"); /* zero, one swizzles handled already */
2855 }
2856 }
2857
2858 void
2859 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2860 {
2861 int s = key->tex.swizzles[sampler];
2862
2863 this->result = src_reg(this, ir->type);
2864 dst_reg swizzled_result(this->result);
2865
2866 if (ir->op == ir_query_levels) {
2867 /* # levels is in .w */
2868 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2869 emit(MOV(swizzled_result, orig_val));
2870 return;
2871 }
2872
2873 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2874 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2875 emit(MOV(swizzled_result, orig_val));
2876 return;
2877 }
2878
2879
2880 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2881 int swizzle[4] = {0};
2882
2883 for (int i = 0; i < 4; i++) {
2884 switch (GET_SWZ(s, i)) {
2885 case SWIZZLE_ZERO:
2886 zero_mask |= (1 << i);
2887 break;
2888 case SWIZZLE_ONE:
2889 one_mask |= (1 << i);
2890 break;
2891 default:
2892 copy_mask |= (1 << i);
2893 swizzle[i] = GET_SWZ(s, i);
2894 break;
2895 }
2896 }
2897
2898 if (copy_mask) {
2899 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2900 swizzled_result.writemask = copy_mask;
2901 emit(MOV(swizzled_result, orig_val));
2902 }
2903
2904 if (zero_mask) {
2905 swizzled_result.writemask = zero_mask;
2906 emit(MOV(swizzled_result, src_reg(0.0f)));
2907 }
2908
2909 if (one_mask) {
2910 swizzled_result.writemask = one_mask;
2911 emit(MOV(swizzled_result, src_reg(1.0f)));
2912 }
2913 }
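/* Illustrative example (not part of the upstream code): for a hypothetical
 * texture swizzle of (R, G, ZERO, ONE), copy_mask ends up as .xy with the
 * identity swizzle, zero_mask as .z and one_mask as .w, so the result is
 * assembled from one swizzled MOV plus one MOV of 0.0f and one MOV of 1.0f.
 */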
2914
2915 void
2916 vec4_visitor::visit(ir_return *)
2917 {
2918 unreachable("not reached");
2919 }
2920
2921 void
2922 vec4_visitor::visit(ir_discard *)
2923 {
2924 unreachable("not reached");
2925 }
2926
2927 void
2928 vec4_visitor::visit(ir_if *ir)
2929 {
2930 /* Don't point the annotation at the if statement, because then it would
2931 * get printed along with the then and else blocks.
2932 */
2933 this->base_ir = ir->condition;
2934
2935 if (devinfo->gen == 6) {
2936 emit_if_gen6(ir);
2937 } else {
2938 enum brw_predicate predicate;
2939 emit_bool_to_cond_code(ir->condition, &predicate);
2940 emit(IF(predicate));
2941 }
2942
2943 visit_instructions(&ir->then_instructions);
2944
2945 if (!ir->else_instructions.is_empty()) {
2946 this->base_ir = ir->condition;
2947 emit(BRW_OPCODE_ELSE);
2948
2949 visit_instructions(&ir->else_instructions);
2950 }
2951
2952 this->base_ir = ir->condition;
2953 emit(BRW_OPCODE_ENDIF);
2954 }
2955
2956 void
2957 vec4_visitor::visit(ir_emit_vertex *)
2958 {
2959 unreachable("not reached");
2960 }
2961
2962 void
2963 vec4_visitor::visit(ir_end_primitive *)
2964 {
2965 unreachable("not reached");
2966 }
2967
2968 void
2969 vec4_visitor::visit(ir_barrier *)
2970 {
2971 unreachable("not reached");
2972 }
2973
2974 void
2975 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2976 dst_reg dst, src_reg offset,
2977 src_reg src0, src_reg src1)
2978 {
2979 unsigned mlen = 0;
2980
2981 /* Set the atomic operation offset. */
2982 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2983 mlen++;
2984
2985 /* Set the atomic operation arguments. */
2986 if (src0.file != BAD_FILE) {
2987 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2988 mlen++;
2989 }
2990
2991 if (src1.file != BAD_FILE) {
2992 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2993 mlen++;
2994 }
2995
2996 /* Emit the instruction. Note that this maps to the normal SIMD8
2997 * untyped atomic message on Ivy Bridge, but that's OK because
2998 * unused channels will be masked out.
2999 */
3000 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3001 brw_message_reg(0),
3002 src_reg(surf_index), src_reg(atomic_op));
3003 inst->mlen = mlen;
3004 }
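/* Illustrative note (not part of the upstream code): for the atomic counter
 * increment/predecrement intrinsics handled earlier, src0 and src1 are left
 * as BAD_FILE, so only the offset is loaded and the message goes out with
 * mlen == 1.
 */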
3005
3006 void
3007 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3008 src_reg offset)
3009 {
3010 /* Set the surface read offset. */
3011 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3012
3013 /* Emit the instruction. Note that this maps to the normal SIMD8
3014 * untyped surface read message, but that's OK because unused
3015 * channels will be masked out.
3016 */
3017 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3018 brw_message_reg(0),
3019 src_reg(surf_index), src_reg(1));
3020 inst->mlen = 1;
3021 }
3022
3023 void
3024 vec4_visitor::emit_ndc_computation()
3025 {
3026 /* Get the position */
3027 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3028
3029 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3030 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3031 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3032
3033 current_annotation = "NDC";
3034 dst_reg ndc_w = ndc;
3035 ndc_w.writemask = WRITEMASK_W;
3036 src_reg pos_w = pos;
3037 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3038 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3039
3040 dst_reg ndc_xyz = ndc;
3041 ndc_xyz.writemask = WRITEMASK_XYZ;
3042
3043 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3044 }
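/* Illustrative note (not part of the upstream code): for a hypothetical clip
 * position of (2, 4, 6, 2), the RCP writes 1/w == 0.5 into ndc.w and the MUL
 * produces ndc.xyz == (1, 2, 3), i.e. the (x/w, y/w, z/w, 1/w) layout noted
 * in the function.
 */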
3045
3046 void
3047 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3048 {
3049 if (devinfo->gen < 6 &&
3050 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3051 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3052 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3053 dst_reg header1_w = header1;
3054 header1_w.writemask = WRITEMASK_W;
3055
3056 emit(MOV(header1, 0u));
3057
3058 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3059 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3060
3061 current_annotation = "Point size";
3062 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3063 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3064 }
3065
3066 if (key->userclip_active) {
3067 current_annotation = "Clipping flags";
3068 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3069 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3070
3071 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3072 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3073 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3074
3075 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3076 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3077 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3078 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3079 }
3080
3081 /* i965 clipping workaround:
3082 * 1) Test for negative rhw
3083 * 2) If set,
3084 * set ndc = (0,0,0,0)
3085 * set ucp[6] = 1
3086 *
3087 * Later, clipping will detect ucp[6] and ensure the primitive is
3088 * clipped against all fixed planes.
3089 */
3090 if (devinfo->has_negative_rhw_bug) {
3091 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3092 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3093 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3094 vec4_instruction *inst;
3095 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3096 inst->predicate = BRW_PREDICATE_NORMAL;
3097 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3098 inst->predicate = BRW_PREDICATE_NORMAL;
3099 }
3100
3101 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3102 } else if (devinfo->gen < 6) {
3103 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3104 } else {
3105 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3106 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3107 dst_reg reg_w = reg;
3108 reg_w.writemask = WRITEMASK_W;
3109 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3110 }
3111 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3112 dst_reg reg_y = reg;
3113 reg_y.writemask = WRITEMASK_Y;
3114 reg_y.type = BRW_REGISTER_TYPE_D;
3115 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3116 }
3117 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3118 dst_reg reg_z = reg;
3119 reg_z.writemask = WRITEMASK_Z;
3120 reg_z.type = BRW_REGISTER_TYPE_D;
3121 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3122 }
3123 }
3124 }
3125
3126 void
3127 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3128 {
3129 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3130 *
3131 * "If a linked set of shaders forming the vertex stage contains no
3132 * static write to gl_ClipVertex or gl_ClipDistance, but the
3133 * application has requested clipping against user clip planes through
3134 * the API, then the coordinate written to gl_Position is used for
3135 * comparison against the user clip planes."
3136 *
3137 * This function is only called if the shader didn't write to
3138 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3139 * if the user wrote to it; otherwise we use gl_Position.
3140 */
3141 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3142 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3143 clip_vertex = VARYING_SLOT_POS;
3144 }
3145
3146 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3147 ++i) {
3148 reg.writemask = 1 << i;
3149 emit(DP4(reg,
3150 src_reg(output_reg[clip_vertex]),
3151 src_reg(this->userplane[i + offset])));
3152 }
3153 }
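/* Illustrative note (not part of the upstream code): each DP4 above writes a
 * single clip distance into one component of reg, so the call with offset 0
 * fills distances 0-3 and the call with offset 4 fills distances 4-7, subject
 * to key->nr_userclip_plane_consts.
 */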
3154
3155 vec4_instruction *
3156 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3157 {
3158 assert (varying < VARYING_SLOT_MAX);
3159 reg.type = output_reg[varying].type;
3160 current_annotation = output_reg_annotation[varying];
3161 /* Copy the register, saturating if necessary */
3162 return emit(MOV(reg, src_reg(output_reg[varying])));
3163 }
3164
3165 void
3166 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3167 {
3168 reg.type = BRW_REGISTER_TYPE_F;
3169
3170 switch (varying) {
3171 case VARYING_SLOT_PSIZ:
3172 {
3173 /* PSIZ is always in slot 0, and is coupled with other flags. */
3174 current_annotation = "indices, point width, clip flags";
3175 emit_psiz_and_flags(reg);
3176 break;
3177 }
3178 case BRW_VARYING_SLOT_NDC:
3179 current_annotation = "NDC";
3180 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3181 break;
3182 case VARYING_SLOT_POS:
3183 current_annotation = "gl_Position";
3184 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3185 break;
3186 case VARYING_SLOT_EDGE:
3187 /* This is present when doing unfilled polygons. We're supposed to copy
3188 * the edge flag from the user-provided vertex array
3189 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3190 * of that attribute (starts as 1.0f). This is then used in clipping to
3191 * determine which edges should be drawn as wireframe.
3192 */
3193 current_annotation = "edge flag";
3194 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3195 glsl_type::float_type, WRITEMASK_XYZW))));
3196 break;
3197 case BRW_VARYING_SLOT_PAD:
3198 /* No need to write to this slot */
3199 break;
3200 case VARYING_SLOT_COL0:
3201 case VARYING_SLOT_COL1:
3202 case VARYING_SLOT_BFC0:
3203 case VARYING_SLOT_BFC1: {
3204 /* These built-in varyings are only supported in compatibility mode,
3205 * and we only support GS in core profile. So, this must be a vertex
3206 * shader.
3207 */
3208 assert(stage == MESA_SHADER_VERTEX);
3209 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3210 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3211 inst->saturate = true;
3212 break;
3213 }
3214
3215 default:
3216 emit_generic_urb_slot(reg, varying);
3217 break;
3218 }
3219 }
3220
3221 static int
3222 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3223 {
3224 if (devinfo->gen >= 6) {
3225 /* URB data written (does not include the message header reg) must
3226 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3227 * section 5.4.3.2.2: URB_INTERLEAVED.
3228 *
3229 * URB entries are allocated on a multiple of 1024 bits, so an
3230 * extra 128 bits written here to make the end align to 256 is
3231 * no problem.
3232 */
3233 if ((mlen % 2) != 1)
3234 mlen++;
3235 }
3236
3237 return mlen;
3238 }
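/* Illustrative note (not part of the upstream code): on Gen6+, an mlen of 4
 * (1 header + 3 data registers) becomes 5 so that the data portion is a
 * multiple of two registers (256 bits); an mlen of 3 is already aligned and
 * is left alone.
 */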
3239
3240
3241 /**
3242 * Generates the VUE payload plus the necessary URB write instructions to
3243 * output it.
3244 *
3245 * The VUE layout is documented in Volume 2a.
3246 */
3247 void
3248 vec4_visitor::emit_vertex()
3249 {
3250 /* MRF 0 is reserved for the debugger, so start with message header
3251 * in MRF 1.
3252 */
3253 int base_mrf = 1;
3254 int mrf = base_mrf;
3255 /* In the process of generating our URB write message contents, we
3256 * may need to unspill a register or load from an array. Those
3257 * reads would use MRFs 14-15.
3258 */
3259 int max_usable_mrf = 13;
3260
3261 /* The following assertion verifies that max_usable_mrf causes an
3262 * even-numbered amount of URB write data, which will meet gen6's
3263 * requirements for length alignment.
3264 */
3265 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3266
3267 /* First mrf is the g0-based message header containing URB handles and
3268 * such.
3269 */
3270 emit_urb_write_header(mrf++);
3271
3272 if (devinfo->gen < 6) {
3273 emit_ndc_computation();
3274 }
3275
3276 /* Lower legacy ff and ClipVertex clipping to clip distances */
3277 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3278 current_annotation = "user clip distances";
3279
3280 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3281 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3282
3283 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3284 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3285 }
3286
3287 /* We may need to split this up into several URB writes, so do them in a
3288 * loop.
3289 */
3290 int slot = 0;
3291 bool complete = false;
3292 do {
3293 /* URB offset is in URB row increments, and each of our MRFs is half of
3294 * one of those, since we're doing interleaved writes.
3295 */
3296 int offset = slot / 2;
3297
3298 mrf = base_mrf + 1;
3299 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3300 emit_urb_slot(dst_reg(MRF, mrf++),
3301 prog_data->vue_map.slot_to_varying[slot]);
3302
3303 /* If this was max_usable_mrf, we can't fit anything more into this
3304 * URB WRITE.
3305 */
3306 if (mrf > max_usable_mrf) {
3307 slot++;
3308 break;
3309 }
3310 }
3311
3312 complete = slot >= prog_data->vue_map.num_slots;
3313 current_annotation = "URB write";
3314 vec4_instruction *inst = emit_urb_write_opcode(complete);
3315 inst->base_mrf = base_mrf;
3316 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3317 inst->offset += offset;
3318 } while(!complete);
3319 }
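/* Illustrative note (not part of the upstream code): with base_mrf == 1 and
 * max_usable_mrf == 13, each pass through the loop above packs up to 12 VUE
 * slots into MRFs 2..13.  A hypothetical VUE map with 18 slots therefore
 * produces two URB writes: slots 0-11 at offset 0, then slots 12-17 with the
 * second write's offset bumped by 12 / 2 == 6 rows.
 */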
3320
3321
3322 src_reg
3323 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3324 src_reg *reladdr, int reg_offset)
3325 {
3326 /* Because we store the values to scratch interleaved like our
3327 * vertex data, we need to scale the vec4 index by 2.
3328 */
3329 int message_header_scale = 2;
3330
3331 /* Pre-gen6, the message header uses byte offsets instead of vec4
3332 * (16-byte) offset units.
3333 */
3334 if (devinfo->gen < 6)
3335 message_header_scale *= 16;
3336
3337 if (reladdr) {
3338 src_reg index = src_reg(this, glsl_type::int_type);
3339
3340 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3341 src_reg(reg_offset)));
3342 emit_before(block, inst, MUL(dst_reg(index), index,
3343 src_reg(message_header_scale)));
3344
3345 return index;
3346 } else {
3347 return src_reg(reg_offset * message_header_scale);
3348 }
3349 }
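/* Illustrative note (not part of the upstream code): with a constant
 * reg_offset of 3 and no reladdr, this returns an immediate of 6 on Gen6+
 * (vec4 rows are interleaved, hence the scale of 2) and 96 on Gen4-5, where
 * the message header wants byte offsets (3 * 2 * 16).
 */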
3350
3351 src_reg
3352 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3353 src_reg *reladdr, int reg_offset)
3354 {
3355 if (reladdr) {
3356 src_reg index = src_reg(this, glsl_type::int_type);
3357
3358 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3359 src_reg(reg_offset)));
3360
3361 /* Pre-gen6, the message header uses byte offsets instead of vec4
3362 * (16-byte) offset units.
3363 */
3364 if (devinfo->gen < 6) {
3365 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3366 }
3367
3368 return index;
3369 } else if (devinfo->gen >= 8) {
3370 /* Store the offset in a GRF so we can send-from-GRF. */
3371 src_reg offset = src_reg(this, glsl_type::int_type);
3372 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3373 return offset;
3374 } else {
3375 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3376 return src_reg(reg_offset * message_header_scale);
3377 }
3378 }
3379
3380 /**
3381 * Emits an instruction before @inst to load the value named by @orig_src
3382 * from scratch space at @base_offset to @temp.
3383 *
3384 * @base_offset is measured in 32-byte units (the size of a register).
3385 */
3386 void
3387 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3388 dst_reg temp, src_reg orig_src,
3389 int base_offset)
3390 {
3391 int reg_offset = base_offset + orig_src.reg_offset;
3392 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3393 reg_offset);
3394
3395 emit_before(block, inst, SCRATCH_READ(temp, index));
3396 }
3397
3398 /**
3399 * Emits an instruction after @inst to store the value to be written
3400 * to @orig_dst to scratch space at @base_offset, from @temp.
3401 *
3402 * @base_offset is measured in 32-byte units (the size of a register).
3403 */
3404 void
3405 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3406 int base_offset)
3407 {
3408 int reg_offset = base_offset + inst->dst.reg_offset;
3409 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3410 reg_offset);
3411
3412 /* Create a temporary register to store *inst's result in.
3413 *
3414 * We have to be careful in MOVing from our temporary result register in
3415 * the scratch write. If we swizzle from channels of the temporary that
3416 * weren't initialized, it will confuse live interval analysis, which will
3417 * make spilling fail to make progress.
3418 */
3419 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3420 inst->dst.type),
3421 brw_swizzle_for_mask(inst->dst.writemask));
3422 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3423 inst->dst.writemask));
3424 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3425 write->predicate = inst->predicate;
3426 write->ir = inst->ir;
3427 write->annotation = inst->annotation;
3428 inst->insert_after(block, write);
3429
3430 inst->dst.file = temp.file;
3431 inst->dst.reg = temp.reg;
3432 inst->dst.reg_offset = temp.reg_offset;
3433 inst->dst.reladdr = NULL;
3434 }
3435
3436 /**
3437 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3438 * adds the scratch read(s) before \p inst. The function also checks for
3439 * recursive reladdr scratch accesses, issuing the corresponding scratch
3440 * loads and rewriting reladdr references accordingly.
3441 *
3442 * \return \p src if it did not require a scratch load, otherwise, the
3443 * register holding the result of the scratch load that the caller should
3444 * use to rewrite src.
3445 */
3446 src_reg
3447 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3448 vec4_instruction *inst, src_reg src)
3449 {
3450 /* Resolve recursive reladdr scratch access by calling ourselves
3451 * with src.reladdr
3452 */
3453 if (src.reladdr)
3454 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3455 *src.reladdr);
3456
3457 /* Now handle scratch access on src */
3458 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3459 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3460 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3461 src.reg = temp.reg;
3462 src.reg_offset = temp.reg_offset;
3463 src.reladdr = NULL;
3464 }
3465
3466 return src;
3467 }
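/* Illustrative example (not part of the upstream code): for a source like
 * a[b[i]] where both a and b were sent to scratch, the recursive call first
 * emits a scratch read for b[i] and rewrites the reladdr to point at that
 * temporary; the outer invocation then emits the scratch read for a itself
 * and returns the GRF holding the loaded value.
 */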
3468
3469 /**
3470 * We can't generally support array access in GRF space, because a
3471 * single instruction's destination can only span 2 contiguous
3472 * registers. So, we send all GRF arrays that get variable index
3473 * access to scratch space.
3474 */
3475 void
3476 vec4_visitor::move_grf_array_access_to_scratch()
3477 {
3478 int scratch_loc[this->alloc.count];
3479 memset(scratch_loc, -1, sizeof(scratch_loc));
3480
3481 /* First, calculate the set of virtual GRFs that need to be punted
3482 * to scratch due to having any array access on them, and where in
3483 * scratch.
3484 */
3485 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3486 if (inst->dst.file == GRF && inst->dst.reladdr) {
3487 if (scratch_loc[inst->dst.reg] == -1) {
3488 scratch_loc[inst->dst.reg] = c->last_scratch;
3489 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3490 }
3491
3492 for (src_reg *iter = inst->dst.reladdr;
3493 iter->reladdr;
3494 iter = iter->reladdr) {
3495 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3496 scratch_loc[iter->reg] = c->last_scratch;
3497 c->last_scratch += this->alloc.sizes[iter->reg];
3498 }
3499 }
3500 }
3501
3502 for (int i = 0 ; i < 3; i++) {
3503 for (src_reg *iter = &inst->src[i];
3504 iter->reladdr;
3505 iter = iter->reladdr) {
3506 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3507 scratch_loc[iter->reg] = c->last_scratch;
3508 c->last_scratch += this->alloc.sizes[iter->reg];
3509 }
3510 }
3511 }
3512 }
3513
3514 /* Now, for anything that will be accessed through scratch, rewrite
3515 * it to load/store. Note that this is a _safe list walk, because
3516 * we may generate a new scratch_write instruction after the one
3517 * we're processing.
3518 */
3519 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3520 /* Set up the annotation tracking for new generated instructions. */
3521 base_ir = inst->ir;
3522 current_annotation = inst->annotation;
3523
3524 /* First handle scratch access on the dst. Notice we have to handle
3525 * the case where the dst's reladdr also points to scratch space.
3526 */
3527 if (inst->dst.reladdr)
3528 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3529 *inst->dst.reladdr);
3530
3531 /* Now that we have handled any (possibly recursive) reladdr scratch
3532 * accesses for dst we can safely do the scratch write for dst itself
3533 */
3534 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3535 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3536
3537 /* Now handle scratch access on any src. In this case, since inst->src[i]
3538 * already is a src_reg, we can just call emit_resolve_reladdr with
3539 * inst->src[i] and it will take care of handling scratch loads for
3540 * both src and src.reladdr (recursively).
3541 */
3542 for (int i = 0 ; i < 3; i++) {
3543 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3544 inst->src[i]);
3545 }
3546 }
3547 }
3548
3549 /**
3550 * Emits an instruction before @inst to load the value named by @orig_src
3551 * from the pull constant buffer (surface) at @base_offset to @temp.
3552 */
3553 void
3554 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3555 dst_reg temp, src_reg orig_src,
3556 int base_offset)
3557 {
3558 int reg_offset = base_offset + orig_src.reg_offset;
3559 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3560 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3561 reg_offset);
3562
3563 emit_pull_constant_load_reg(temp,
3564 index,
3565 offset,
3566 block, inst);
3567 }
3568
3569 /**
3570 * Implements array access of uniforms by inserting a
3571 * PULL_CONSTANT_LOAD instruction.
3572 *
3573 * Unlike temporary GRF array access (where we don't support it due to
3574 * the difficulty of doing relative addressing on instruction
3575 * destinations), we could potentially do array access of uniforms
3576 * that were loaded in GRF space as push constants. In real-world
3577 * usage we've seen, though, the arrays being used are always larger
3578 * than we could load as push constants, so just always move all
3579 * uniform array access out to a pull constant buffer.
3580 */
3581 void
3582 vec4_visitor::move_uniform_array_access_to_pull_constants()
3583 {
3584 int pull_constant_loc[this->uniforms];
3585 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3586 bool nested_reladdr;
3587
3588 /* Walk through and find array access of uniforms. Put a copy of that
3589 * uniform in the pull constant buffer.
3590 *
3591 * Note that we don't move constant-indexed accesses to arrays. No
3592 * testing has been done of the performance impact of this choice.
3593 */
3594 do {
3595 nested_reladdr = false;
3596
3597 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3598 for (int i = 0 ; i < 3; i++) {
3599 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3600 continue;
3601
3602 int uniform = inst->src[i].reg;
3603
3604 if (inst->src[i].reladdr->reladdr)
3605 nested_reladdr = true; /* will need another pass */
3606
3607 /* If this array isn't already present in the pull constant buffer,
3608 * add it.
3609 */
3610 if (pull_constant_loc[uniform] == -1) {
3611 const gl_constant_value **values =
3612 &stage_prog_data->param[uniform * 4];
3613
3614 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3615
3616 assert(uniform < uniform_array_size);
3617 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3618 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3619 = values[j];
3620 }
3621 }
3622
3623 /* Set up the annotation tracking for new generated instructions. */
3624 base_ir = inst->ir;
3625 current_annotation = inst->annotation;
3626
3627 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3628
3629 emit_pull_constant_load(block, inst, temp, inst->src[i],
3630 pull_constant_loc[uniform]);
3631
3632 inst->src[i].file = temp.file;
3633 inst->src[i].reg = temp.reg;
3634 inst->src[i].reg_offset = temp.reg_offset;
3635 inst->src[i].reladdr = NULL;
3636 }
3637 }
3638 } while (nested_reladdr);
3639
3640 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3641 * no need to track them as larger-than-vec4 objects. This will be
3642 * relied on in cutting out unused uniform vectors from push
3643 * constants.
3644 */
3645 split_uniform_registers();
3646 }
3647
3648 void
3649 vec4_visitor::resolve_ud_negate(src_reg *reg)
3650 {
3651 if (reg->type != BRW_REGISTER_TYPE_UD ||
3652 !reg->negate)
3653 return;
3654
3655 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3656 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3657 *reg = temp;
3658 }
3659
3660 /**
3661 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3662 *
3663 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3664 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3665 */
3666 void
3667 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3668 {
3669 assert(devinfo->gen <= 5);
3670
3671 if (!rvalue->type->is_boolean())
3672 return;
3673
3674 src_reg and_result = src_reg(this, rvalue->type);
3675 src_reg neg_result = src_reg(this, rvalue->type);
3676 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3677 emit(MOV(dst_reg(neg_result), negate(and_result)));
3678 *reg = neg_result;
3679 }
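/* Illustrative note (not part of the upstream code): a Gen4-5 CMP result with
 * only its LSB defined, say 0x00000001, is first masked down to 1 by the AND
 * and then turned into ~0 (integer -1) by the MOV of the negated value, which
 * is the canonical "true" the rest of the backend expects.
 */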
3680
3681 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3682 struct brw_vec4_compile *c,
3683 struct gl_program *prog,
3684 const struct brw_vue_prog_key *key,
3685 struct brw_vue_prog_data *prog_data,
3686 struct gl_shader_program *shader_prog,
3687 gl_shader_stage stage,
3688 void *mem_ctx,
3689 bool no_spills,
3690 int shader_time_index)
3691 : backend_shader(compiler, NULL, mem_ctx,
3692 shader_prog, prog, &prog_data->base, stage),
3693 c(c),
3694 key(key),
3695 prog_data(prog_data),
3696 sanity_param_count(0),
3697 fail_msg(NULL),
3698 first_non_payload_grf(0),
3699 need_all_constants_in_pull_buffer(false),
3700 no_spills(no_spills),
3701 shader_time_index(shader_time_index)
3702 {
3703 this->failed = false;
3704
3705 this->base_ir = NULL;
3706 this->current_annotation = NULL;
3707 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3708
3709 this->variable_ht = hash_table_ctor(0,
3710 hash_table_pointer_hash,
3711 hash_table_pointer_compare);
3712
3713 this->virtual_grf_start = NULL;
3714 this->virtual_grf_end = NULL;
3715 this->live_intervals = NULL;
3716
3717 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3718
3719 this->uniforms = 0;
3720
3721 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3722 * at least one uniform slot. See setup_uniforms() in brw_vec4.cpp.
3723 */
3724 this->uniform_array_size = 1;
3725 if (prog_data) {
3726 this->uniform_array_size =
3727 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3728 }
3729
3730 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3731 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3732 }
3733
3734 vec4_visitor::~vec4_visitor()
3735 {
3736 hash_table_dtor(this->variable_ht);
3737 }
3738
3739
3740 void
3741 vec4_visitor::fail(const char *format, ...)
3742 {
3743 va_list va;
3744 char *msg;
3745
3746 if (failed)
3747 return;
3748
3749 failed = true;
3750
3751 va_start(va, format);
3752 msg = ralloc_vasprintf(mem_ctx, format, va);
3753 va_end(va);
3754 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3755
3756 this->fail_msg = msg;
3757
3758 if (debug_enabled) {
3759 fprintf(stderr, "%s", msg);
3760 }
3761 }
3762
3763 } /* namespace brw */