i965: Implement nir_op_uadd_carry and _usub_borrow without accumulator.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
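
/* A minimal usage sketch: CMP is frequently issued with a null destination
 * purely for its flag-register side effect, and the packed flag bits then
 * predicate a following SEL.  This mirrors the pre-gen6 path of emit_minmax()
 * further down; the helper name is hypothetical and the block is never
 * compiled.
 */
#if 0
vec4_instruction *
vec4_visitor::example_select_smaller(const dst_reg &dst,
                                     const src_reg &a, const src_reg &b)
{
   /* Only the flag register matters here; the GRF result is discarded. */
   emit(CMP(dst_null_d(), a, b, BRW_CONDITIONAL_L));

   /* dst = (a < b) ? a : b, channel by channel. */
   vec4_instruction *sel = emit(BRW_OPCODE_SEL, dst, a, b);
   sel->predicate = BRW_PREDICATE_NORMAL;
   return sel;
}
#endif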
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
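
/* For reference, a scalar sketch of the bit layout the SHL/OR sequence above
 * produces per channel, assuming F32TO16 has already left the two half-float
 * results in the low words of the X and Y components.  Hypothetical helper,
 * never compiled.
 */
#if 0
static uint32_t
example_pack_half_2x16_bits(uint16_t x_half, uint16_t y_half)
{
   uint32_t lo = x_half;                  /* tmp.x = 0x0000llll */
   uint32_t hi = (uint32_t)y_half << 16;  /* SHL(dst, tmp.yyyy, 16u) */
   return hi | lo;                        /* OR(dst, dst, tmp.xxxx) */
}
#endif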
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
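
/* The reverse of the packing above, as a scalar sketch: AND/SHR split the
 * dword into two 16-bit words and F16TO32 then widens each one.  Only the
 * word extraction is modeled here; the half-to-float conversion itself is
 * done by the hardware.  Hypothetical helper, never compiled.
 */
#if 0
static void
example_unpack_half_2x16_words(uint32_t packed,
                               uint16_t *x_half, uint16_t *y_half)
{
   *x_half = packed & 0xffff;  /* AND(tmp.x, src0, 0xffffu) */
   *y_half = packed >> 16;     /* SHR(tmp.y, src0, 16u)     */
}
#endif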
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
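
/* A scalar sketch of what the shift/byte-MOV/MUL sequence above computes for
 * a single component: byte i of the dword, rescaled to [0, 1].  The vector
 * float immediate <0, 8, 16, 24> supplies the per-channel shift counts.
 * Hypothetical helper, never compiled.
 */
#if 0
static float
example_unpack_unorm_4x8_component(uint32_t packed, unsigned i)
{
   uint32_t byte = (packed >> (8 * i)) & 0xff;  /* SHR, then UB-typed MOV */
   return byte * (1.0f / 255.0f);               /* MUL by 1/255 */
}
#endif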
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
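
/* A scalar sketch of the per-channel math above: saturate to [0, 1], scale by
 * 255 and round to nearest even; VEC4_OPCODE_PACK_BYTES then gathers the low
 * byte of each channel into one dword.  Hypothetical helper (assumes
 * <math.h> and the default rounding mode), never compiled.
 */
#if 0
static uint8_t
example_pack_unorm_byte(float x)
{
   float saturated = x < 0.0f ? 0.0f : (x > 1.0f ? 1.0f : x);  /* MOV.sat */
   float scaled = saturated * 255.0f;                          /* MUL     */
   return (uint8_t)rintf(scaled);                              /* RNDE    */
}
#endif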
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of the size of the vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
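
/* A few worked examples of the vec4-slot counting above, written as a
 * hypothetical self-check (assumes the usual glsl_type convenience pointers;
 * never compiled).
 */
#if 0
static void
example_type_size_expectations()
{
   assert(type_size(glsl_type::float_type) == 1); /* a scalar still occupies
                                                   * a whole vec4 slot */
   assert(type_size(glsl_type::vec4_type) == 1);
   assert(type_size(glsl_type::mat4_type) == 4);  /* one slot per column */
   /* An array takes length * element slots, and a struct is the sum of its
    * members, so float[10] would be 10 slots.
    */
}
#endif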
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = brw_swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (storage->builtin)
690 continue;
691
692 if (strncmp(ir->name, storage->name, namelen) != 0 ||
693 (storage->name[namelen] != 0 &&
694 storage->name[namelen] != '.' &&
695 storage->name[namelen] != '[')) {
696 continue;
697 }
698
699 gl_constant_value *components = storage->storage;
700 unsigned vector_count = (MAX2(storage->array_elements, 1) *
701 storage->type->matrix_columns);
702
703 for (unsigned s = 0; s < vector_count; s++) {
704 assert(uniforms < uniform_array_size);
705 uniform_vector_size[uniforms] = storage->type->vector_elements;
706
707 int i;
708 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
709 stage_prog_data->param[uniforms * 4 + i] = components;
710 components++;
711 }
712 for (; i < 4; i++) {
713 static gl_constant_value zero = { 0.0 };
714 stage_prog_data->param[uniforms * 4 + i] = &zero;
715 }
716
717 uniforms++;
718 }
719 }
720 }
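
/* The name test in the loop above, pulled out as a standalone sketch: a
 * storage entry matches when its name either equals the uniform's name or
 * continues it with '.' or '[', so "mat" matches "mat", "mat[2]" and
 * "mat.col" but not "material".  Hypothetical helper, never compiled.
 */
#if 0
static bool
example_storage_name_matches(const char *uniform_name, const char *storage_name)
{
   size_t namelen = strlen(uniform_name);

   if (strncmp(uniform_name, storage_name, namelen) != 0)
      return false;

   char next = storage_name[namelen];
   return next == '\0' || next == '.' || next == '[';
}
#endif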
721
722 void
723 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
724 {
725 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
726 assert(this->uniforms < uniform_array_size);
727 this->uniform_vector_size[this->uniforms] = 4;
728 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
729 this->userplane[i].type = BRW_REGISTER_TYPE_F;
730 for (int j = 0; j < 4; ++j) {
731 stage_prog_data->param[this->uniforms * 4 + j] =
732 (gl_constant_value *) &clip_planes[i][j];
733 }
734 ++this->uniforms;
735 }
736 }
737
738 /* Our support for builtin uniforms is even scarier than non-builtin.
739 * It sits on top of the PROG_STATE_VAR parameters that are
740 * automatically updated from GL context state.
741 */
742 void
743 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
744 {
745 const ir_state_slot *const slots = ir->get_state_slots();
746 assert(slots != NULL);
747
748 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
749 /* This state reference has already been setup by ir_to_mesa,
750 * but we'll get the same index back here. We can reference
751 * ParameterValues directly, since unlike brw_fs.cpp, we never
752 * add new state references during compile.
753 */
754 int index = _mesa_add_state_reference(this->prog->Parameters,
755 (gl_state_index *)slots[i].tokens);
756 gl_constant_value *values =
757 &this->prog->Parameters->ParameterValues[index][0];
758
759 assert(this->uniforms < uniform_array_size);
760
761 for (unsigned j = 0; j < 4; j++)
762 stage_prog_data->param[this->uniforms * 4 + j] =
763 &values[GET_SWZ(slots[i].swizzle, j)];
764
765 this->uniform_vector_size[this->uniforms] =
766 (ir->type->is_scalar() || ir->type->is_vector() ||
767 ir->type->is_matrix() ? ir->type->vector_elements : 4);
768
769 this->uniforms++;
770 }
771 }
772
773 dst_reg *
774 vec4_visitor::variable_storage(ir_variable *var)
775 {
776 return (dst_reg *)hash_table_find(this->variable_ht, var);
777 }
778
779 void
780 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
781 enum brw_predicate *predicate)
782 {
783 ir_expression *expr = ir->as_expression();
784
785 *predicate = BRW_PREDICATE_NORMAL;
786
787 if (expr && expr->operation != ir_binop_ubo_load) {
788 src_reg op[3];
789 vec4_instruction *inst;
790
791 assert(expr->get_num_operands() <= 3);
792 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
793 expr->operands[i]->accept(this);
794 op[i] = this->result;
795
796 resolve_ud_negate(&op[i]);
797 }
798
799 switch (expr->operation) {
800 case ir_unop_logic_not:
801 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
802 inst->conditional_mod = BRW_CONDITIONAL_Z;
803 break;
804
805 case ir_binop_logic_xor:
806 if (devinfo->gen <= 5) {
807 src_reg temp = src_reg(this, ir->type);
808 emit(XOR(dst_reg(temp), op[0], op[1]));
809 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
810 } else {
811 inst = emit(XOR(dst_null_d(), op[0], op[1]));
812 }
813 inst->conditional_mod = BRW_CONDITIONAL_NZ;
814 break;
815
816 case ir_binop_logic_or:
817 if (devinfo->gen <= 5) {
818 src_reg temp = src_reg(this, ir->type);
819 emit(OR(dst_reg(temp), op[0], op[1]));
820 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
821 } else {
822 inst = emit(OR(dst_null_d(), op[0], op[1]));
823 }
824 inst->conditional_mod = BRW_CONDITIONAL_NZ;
825 break;
826
827 case ir_binop_logic_and:
828 if (devinfo->gen <= 5) {
829 src_reg temp = src_reg(this, ir->type);
830 emit(AND(dst_reg(temp), op[0], op[1]));
831 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
832 } else {
833 inst = emit(AND(dst_null_d(), op[0], op[1]));
834 }
835 inst->conditional_mod = BRW_CONDITIONAL_NZ;
836 break;
837
838 case ir_unop_f2b:
839 if (devinfo->gen >= 6) {
840 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
841 } else {
842 inst = emit(MOV(dst_null_f(), op[0]));
843 inst->conditional_mod = BRW_CONDITIONAL_NZ;
844 }
845 break;
846
847 case ir_unop_i2b:
848 if (devinfo->gen >= 6) {
849 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
850 } else {
851 inst = emit(MOV(dst_null_d(), op[0]));
852 inst->conditional_mod = BRW_CONDITIONAL_NZ;
853 }
854 break;
855
856 case ir_binop_all_equal:
857 if (devinfo->gen <= 5) {
858 resolve_bool_comparison(expr->operands[0], &op[0]);
859 resolve_bool_comparison(expr->operands[1], &op[1]);
860 }
861 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
862 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
863 break;
864
865 case ir_binop_any_nequal:
866 if (devinfo->gen <= 5) {
867 resolve_bool_comparison(expr->operands[0], &op[0]);
868 resolve_bool_comparison(expr->operands[1], &op[1]);
869 }
870 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
871 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
872 break;
873
874 case ir_unop_any:
875 if (devinfo->gen <= 5) {
876 resolve_bool_comparison(expr->operands[0], &op[0]);
877 }
878 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
879 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
880 break;
881
882 case ir_binop_greater:
883 case ir_binop_gequal:
884 case ir_binop_less:
885 case ir_binop_lequal:
886 case ir_binop_equal:
887 case ir_binop_nequal:
888 if (devinfo->gen <= 5) {
889 resolve_bool_comparison(expr->operands[0], &op[0]);
890 resolve_bool_comparison(expr->operands[1], &op[1]);
891 }
892 emit(CMP(dst_null_d(), op[0], op[1],
893 brw_conditional_for_comparison(expr->operation)));
894 break;
895
896 case ir_triop_csel: {
897 /* Expand the boolean condition into the flag register. */
898 inst = emit(MOV(dst_null_d(), op[0]));
899 inst->conditional_mod = BRW_CONDITIONAL_NZ;
900
901 /* Select which boolean to return. */
902 dst_reg temp(this, expr->operands[1]->type);
903 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
904 inst->predicate = BRW_PREDICATE_NORMAL;
905
906 /* Expand the result to a condition code. */
907 inst = emit(MOV(dst_null_d(), src_reg(temp)));
908 inst->conditional_mod = BRW_CONDITIONAL_NZ;
909 break;
910 }
911
912 default:
913 unreachable("not reached");
914 }
915 return;
916 }
917
918 ir->accept(this);
919
920 resolve_ud_negate(&this->result);
921
922 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
923 inst->conditional_mod = BRW_CONDITIONAL_NZ;
924 }
925
926 /**
927 * Emit a gen6 IF statement with the comparison folded into the IF
928 * instruction.
929 */
930 void
931 vec4_visitor::emit_if_gen6(ir_if *ir)
932 {
933 ir_expression *expr = ir->condition->as_expression();
934
935 if (expr && expr->operation != ir_binop_ubo_load) {
936 src_reg op[3];
937 dst_reg temp;
938
939 assert(expr->get_num_operands() <= 3);
940 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
941 expr->operands[i]->accept(this);
942 op[i] = this->result;
943 }
944
945 switch (expr->operation) {
946 case ir_unop_logic_not:
947 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
948 return;
949
950 case ir_binop_logic_xor:
951 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
952 return;
953
954 case ir_binop_logic_or:
955 temp = dst_reg(this, glsl_type::bool_type);
956 emit(OR(temp, op[0], op[1]));
957 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
958 return;
959
960 case ir_binop_logic_and:
961 temp = dst_reg(this, glsl_type::bool_type);
962 emit(AND(temp, op[0], op[1]));
963 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
964 return;
965
966 case ir_unop_f2b:
967 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
968 return;
969
970 case ir_unop_i2b:
971 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
972 return;
973
974 case ir_binop_greater:
975 case ir_binop_gequal:
976 case ir_binop_less:
977 case ir_binop_lequal:
978 case ir_binop_equal:
979 case ir_binop_nequal:
980 emit(IF(op[0], op[1],
981 brw_conditional_for_comparison(expr->operation)));
982 return;
983
984 case ir_binop_all_equal:
985 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
986 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
987 return;
988
989 case ir_binop_any_nequal:
990 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
991 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
992 return;
993
994 case ir_unop_any:
995 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
996 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
997 return;
998
999 case ir_triop_csel: {
1000 /* Expand the boolean condition into the flag register. */
1001 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1002 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1003
1004 /* Select which boolean to return. */
1005 dst_reg temp(this, expr->operands[1]->type);
1006 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1007 inst->predicate = BRW_PREDICATE_NORMAL;
1008
1009 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1010 return;
1011 }
1012
1013 default:
1014 unreachable("not reached");
1015 }
1016 return;
1017 }
1018
1019 ir->condition->accept(this);
1020
1021 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1022 }
1023
1024 void
1025 vec4_visitor::visit(ir_variable *ir)
1026 {
1027 dst_reg *reg = NULL;
1028
1029 if (variable_storage(ir))
1030 return;
1031
1032 switch (ir->data.mode) {
1033 case ir_var_shader_in:
1034 assert(ir->data.location != -1);
1035 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1036 break;
1037
1038 case ir_var_shader_out:
1039 assert(ir->data.location != -1);
1040 reg = new(mem_ctx) dst_reg(this, ir->type);
1041
1042 for (int i = 0; i < type_size(ir->type); i++) {
1043 output_reg[ir->data.location + i] = *reg;
1044 output_reg[ir->data.location + i].reg_offset = i;
1045 output_reg[ir->data.location + i].type =
1046 brw_type_for_base_type(ir->type->get_scalar_type());
1047 output_reg_annotation[ir->data.location + i] = ir->name;
1048 }
1049 break;
1050
1051 case ir_var_auto:
1052 case ir_var_temporary:
1053 reg = new(mem_ctx) dst_reg(this, ir->type);
1054 break;
1055
1056 case ir_var_uniform:
1057 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1058
1059 /* Thanks to the lower_ubo_reference pass, we will see only
1060 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1061 * variables, so no need for them to be in variable_ht.
1062 *
1063 * Some uniforms, such as samplers and atomic counters, have no actual
1064 * storage, so we should ignore them.
1065 */
1066 if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
1067 return;
1068
1069 /* Track how big the whole uniform variable is, in case we need to put a
1070 * copy of its data into pull constants for array access.
1071 */
1072 assert(this->uniforms < uniform_array_size);
1073 this->uniform_size[this->uniforms] = type_size(ir->type);
1074
1075 if (!strncmp(ir->name, "gl_", 3)) {
1076 setup_builtin_uniform_values(ir);
1077 } else {
1078 setup_uniform_values(ir);
1079 }
1080 break;
1081
1082 case ir_var_system_value:
1083 reg = make_reg_for_system_value(ir);
1084 break;
1085
1086 default:
1087 unreachable("not reached");
1088 }
1089
1090 reg->type = brw_type_for_base_type(ir->type);
1091 hash_table_insert(this->variable_ht, reg, ir);
1092 }
1093
1094 void
1095 vec4_visitor::visit(ir_loop *ir)
1096 {
1097 /* We don't want debugging output to print the whole body of the
1098 * loop as the annotation.
1099 */
1100 this->base_ir = NULL;
1101
1102 emit(BRW_OPCODE_DO);
1103
1104 visit_instructions(&ir->body_instructions);
1105
1106 emit(BRW_OPCODE_WHILE);
1107 }
1108
1109 void
1110 vec4_visitor::visit(ir_loop_jump *ir)
1111 {
1112 switch (ir->mode) {
1113 case ir_loop_jump::jump_break:
1114 emit(BRW_OPCODE_BREAK);
1115 break;
1116 case ir_loop_jump::jump_continue:
1117 emit(BRW_OPCODE_CONTINUE);
1118 break;
1119 }
1120 }
1121
1122
1123 void
1124 vec4_visitor::visit(ir_function_signature *)
1125 {
1126 unreachable("not reached");
1127 }
1128
1129 void
1130 vec4_visitor::visit(ir_function *ir)
1131 {
1132 /* Ignore function bodies other than main() -- we shouldn't see calls to
1133 * them since they should all be inlined.
1134 */
1135 if (strcmp(ir->name, "main") == 0) {
1136 const ir_function_signature *sig;
1137 exec_list empty;
1138
1139 sig = ir->matching_signature(NULL, &empty, false);
1140
1141 assert(sig);
1142
1143 visit_instructions(&sig->body);
1144 }
1145 }
1146
1147 bool
1148 vec4_visitor::try_emit_mad(ir_expression *ir)
1149 {
1150 /* 3-src instructions were introduced in gen6. */
1151 if (devinfo->gen < 6)
1152 return false;
1153
1154 /* MAD can only handle floating-point data. */
1155 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1156 return false;
1157
1158 ir_rvalue *nonmul;
1159 ir_expression *mul;
1160 bool mul_negate, mul_abs;
1161
1162 for (int i = 0; i < 2; i++) {
1163 mul_negate = false;
1164 mul_abs = false;
1165
1166 mul = ir->operands[i]->as_expression();
1167 nonmul = ir->operands[1 - i];
1168
1169 if (mul && mul->operation == ir_unop_abs) {
1170 mul = mul->operands[0]->as_expression();
1171 mul_abs = true;
1172 } else if (mul && mul->operation == ir_unop_neg) {
1173 mul = mul->operands[0]->as_expression();
1174 mul_negate = true;
1175 }
1176
1177 if (mul && mul->operation == ir_binop_mul)
1178 break;
1179 }
1180
1181 if (!mul || mul->operation != ir_binop_mul)
1182 return false;
1183
1184 nonmul->accept(this);
1185 src_reg src0 = fix_3src_operand(this->result);
1186
1187 mul->operands[0]->accept(this);
1188 src_reg src1 = fix_3src_operand(this->result);
1189 src1.negate ^= mul_negate;
1190 src1.abs = mul_abs;
1191 if (mul_abs)
1192 src1.negate = false;
1193
1194 mul->operands[1]->accept(this);
1195 src_reg src2 = fix_3src_operand(this->result);
1196 src2.abs = mul_abs;
1197 if (mul_abs)
1198 src2.negate = false;
1199
1200 this->result = src_reg(this, ir->type);
1201 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1202
1203 return true;
1204 }
1205
1206 bool
1207 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1208 {
1209 /* This optimization relies on CMP setting the destination to 0 when
1210 * false. Early hardware only sets the least significant bit, and
1211 * leaves the other bits undefined. So we can't use it.
1212 */
1213 if (devinfo->gen < 6)
1214 return false;
1215
1216 ir_expression *const cmp = ir->operands[0]->as_expression();
1217
1218 if (cmp == NULL)
1219 return false;
1220
1221 switch (cmp->operation) {
1222 case ir_binop_less:
1223 case ir_binop_greater:
1224 case ir_binop_lequal:
1225 case ir_binop_gequal:
1226 case ir_binop_equal:
1227 case ir_binop_nequal:
1228 break;
1229
1230 default:
1231 return false;
1232 }
1233
1234 cmp->operands[0]->accept(this);
1235 const src_reg cmp_src0 = this->result;
1236
1237 cmp->operands[1]->accept(this);
1238 const src_reg cmp_src1 = this->result;
1239
1240 this->result = src_reg(this, ir->type);
1241
1242 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1243 brw_conditional_for_comparison(cmp->operation)));
1244
1245 /* If the comparison is false, this->result will just happen to be zero.
1246 */
1247 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1248 this->result, src_reg(1.0f));
1249 inst->predicate = BRW_PREDICATE_NORMAL;
1250 inst->predicate_inverse = true;
1251
1252 return true;
1253 }
1254
1255 void
1256 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1257 src_reg src0, src_reg src1)
1258 {
1259 vec4_instruction *inst;
1260
1261 if (devinfo->gen >= 6) {
1262 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1263 inst->conditional_mod = conditionalmod;
1264 } else {
1265 emit(CMP(dst, src0, src1, conditionalmod));
1266
1267 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1268 inst->predicate = BRW_PREDICATE_NORMAL;
1269 }
1270 }
1271
1272 void
1273 vec4_visitor::emit_lrp(const dst_reg &dst,
1274 const src_reg &x, const src_reg &y, const src_reg &a)
1275 {
1276 if (devinfo->gen >= 6) {
1277 /* Note that the instruction's argument order is reversed from GLSL
1278 * and the IR.
1279 */
1280 emit(LRP(dst,
1281 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1282 } else {
1283 /* Earlier generations don't support three source operations, so we
1284 * need to emit x*(1-a) + y*a.
1285 */
1286 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1287 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1288 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1289 y_times_a.writemask = dst.writemask;
1290 one_minus_a.writemask = dst.writemask;
1291 x_times_one_minus_a.writemask = dst.writemask;
1292
1293 emit(MUL(y_times_a, y, a));
1294 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1295 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1296 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1297 }
1298 }
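
/* The arithmetic of the pre-gen6 fallback above, as a one-line scalar
 * reference (this is GLSL's mix()).  On gen6+ a single LRP computes the same
 * thing, with the argument order reversed as noted above.  Hypothetical
 * helper, never compiled.
 */
#if 0
static float
example_lrp(float x, float y, float a)
{
   return x * (1.0f - a) + y * a;
}
#endif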
1299
1300 /**
1301 * Emits the instructions needed to perform a pull constant load. before_block
1302 * and before_inst can be NULL, in which case the instructions will be appended
1303 * to the end of the instruction list.
1304 */
1305 void
1306 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1307 src_reg surf_index,
1308 src_reg offset_reg,
1309 bblock_t *before_block,
1310 vec4_instruction *before_inst)
1311 {
1312 assert((before_inst == NULL && before_block == NULL) ||
1313 (before_inst && before_block));
1314
1315 vec4_instruction *pull;
1316
1317 if (devinfo->gen >= 9) {
1318 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1319 src_reg header(this, glsl_type::uvec4_type, 2);
1320
1321 pull = new(mem_ctx)
1322 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1323 dst_reg(header));
1324
1325 if (before_inst)
1326 emit_before(before_block, before_inst, pull);
1327 else
1328 emit(pull);
1329
1330 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1331 offset_reg.type);
1332 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1333
1334 if (before_inst)
1335 emit_before(before_block, before_inst, pull);
1336 else
1337 emit(pull);
1338
1339 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1340 dst,
1341 surf_index,
1342 header);
1343 pull->mlen = 2;
1344 pull->header_size = 1;
1345 } else if (devinfo->gen >= 7) {
1346 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1347
1348 grf_offset.type = offset_reg.type;
1349
1350 pull = MOV(grf_offset, offset_reg);
1351
1352 if (before_inst)
1353 emit_before(before_block, before_inst, pull);
1354 else
1355 emit(pull);
1356
1357 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1358 dst,
1359 surf_index,
1360 src_reg(grf_offset));
1361 pull->mlen = 1;
1362 } else {
1363 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1364 dst,
1365 surf_index,
1366 offset_reg);
1367 pull->base_mrf = 14;
1368 pull->mlen = 1;
1369 }
1370
1371 if (before_inst)
1372 emit_before(before_block, before_inst, pull);
1373 else
1374 emit(pull);
1375 }
1376
1377 void
1378 vec4_visitor::emit_uniformize(const dst_reg &dst, const src_reg &src)
1379 {
1380 const src_reg chan_index(this, glsl_type::uint_type);
1381
1382 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1383 ->force_writemask_all = true;
1384 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1385 ->force_writemask_all = true;
1386 }
1387
1388 void
1389 vec4_visitor::visit(ir_expression *ir)
1390 {
1391 unsigned int operand;
1392 src_reg op[ARRAY_SIZE(ir->operands)];
1393 vec4_instruction *inst;
1394
1395 if (ir->operation == ir_binop_add) {
1396 if (try_emit_mad(ir))
1397 return;
1398 }
1399
1400 if (ir->operation == ir_unop_b2f) {
1401 if (try_emit_b2f_of_compare(ir))
1402 return;
1403 }
1404
1405 /* Storage for our result. Ideally for an assignment we'd be using
1406 * the actual storage for the result here, instead.
1407 */
1408 dst_reg result_dst(this, ir->type);
1409 src_reg result_src(result_dst);
1410
1411 if (ir->operation == ir_triop_csel) {
1412 ir->operands[1]->accept(this);
1413 op[1] = this->result;
1414 ir->operands[2]->accept(this);
1415 op[2] = this->result;
1416
1417 enum brw_predicate predicate;
1418 emit_bool_to_cond_code(ir->operands[0], &predicate);
1419 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1420 inst->predicate = predicate;
1421 this->result = result_src;
1422 return;
1423 }
1424
1425 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1426 this->result.file = BAD_FILE;
1427 ir->operands[operand]->accept(this);
1428 if (this->result.file == BAD_FILE) {
1429 fprintf(stderr, "Failed to get tree for expression operand:\n");
1430 ir->operands[operand]->fprint(stderr);
1431 exit(1);
1432 }
1433 op[operand] = this->result;
1434
1435 /* Matrix expression operands should have been broken down to vector
1436 * operations already.
1437 */
1438 assert(!ir->operands[operand]->type->is_matrix());
1439 }
1440
1441 /* If nothing special happens, this is the result. */
1442 this->result = result_src;
1443
1444 switch (ir->operation) {
1445 case ir_unop_logic_not:
1446 emit(NOT(result_dst, op[0]));
1447 break;
1448 case ir_unop_neg:
1449 op[0].negate = !op[0].negate;
1450 emit(MOV(result_dst, op[0]));
1451 break;
1452 case ir_unop_abs:
1453 op[0].abs = true;
1454 op[0].negate = false;
1455 emit(MOV(result_dst, op[0]));
1456 break;
1457
1458 case ir_unop_sign:
1459 if (ir->type->is_float()) {
1460 /* AND(val, 0x80000000) gives the sign bit.
1461 *
1462 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1463 * zero.
1464 */
1465 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1466
1467 op[0].type = BRW_REGISTER_TYPE_UD;
1468 result_dst.type = BRW_REGISTER_TYPE_UD;
1469 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1470
1471 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1472 inst->predicate = BRW_PREDICATE_NORMAL;
1473
1474 this->result.type = BRW_REGISTER_TYPE_F;
1475 } else {
1476 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1477 * -> non-negative val generates 0x00000000.
1478 * Predicated OR sets 1 if val is positive.
1479 */
1480 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1481
1482 emit(ASR(result_dst, op[0], src_reg(31)));
1483
1484 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1485 inst->predicate = BRW_PREDICATE_NORMAL;
1486 }
1487 break;
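
#if 0
      /* A worked example of the float bit trick above for val = -2.5f (bit
       * pattern 0xC0200000): the AND keeps only the sign bit, and the
       * predicated OR with 0x3F800000 (the bits of 1.0f) yields 0xBF800000,
       * i.e. -1.0f.  When val == 0.0f the CMP leaves the predicate false, the
       * OR is skipped and the result stays 0.  The integer branch works the
       * same way: ASR by 31 gives -1 for negative inputs and 0 otherwise, and
       * the predicated OR with 1 turns that 0 into +1 only when val > 0.
       * Illustrative only; never compiled.
       */
      {
         float val = -2.5f;
         uint32_t bits;
         memcpy(&bits, &val, sizeof(bits));           /* 0xC0200000 */
         bits = (bits & 0x80000000u) | 0x3f800000u;   /* 0xBF800000 == -1.0f */
         (void)bits;
      }
#endif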
1488
1489 case ir_unop_rcp:
1490 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1491 break;
1492
1493 case ir_unop_exp2:
1494 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1495 break;
1496 case ir_unop_log2:
1497 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1498 break;
1499 case ir_unop_exp:
1500 case ir_unop_log:
1501 unreachable("not reached: should be handled by ir_explog_to_explog2");
1502 case ir_unop_sin:
1503 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1504 break;
1505 case ir_unop_cos:
1506 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1507 break;
1508
1509 case ir_unop_dFdx:
1510 case ir_unop_dFdx_coarse:
1511 case ir_unop_dFdx_fine:
1512 case ir_unop_dFdy:
1513 case ir_unop_dFdy_coarse:
1514 case ir_unop_dFdy_fine:
1515 unreachable("derivatives not valid in vertex shader");
1516
1517 case ir_unop_bitfield_reverse:
1518 emit(BFREV(result_dst, op[0]));
1519 break;
1520 case ir_unop_bit_count:
1521 emit(CBIT(result_dst, op[0]));
1522 break;
1523 case ir_unop_find_msb: {
1524 src_reg temp = src_reg(this, glsl_type::uint_type);
1525
1526 inst = emit(FBH(dst_reg(temp), op[0]));
1527 inst->dst.writemask = WRITEMASK_XYZW;
1528
1529 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1530 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1531 * subtract the result from 31 to convert the MSB count into an LSB count.
1532 */
1533
1534 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1535 temp.swizzle = BRW_SWIZZLE_NOOP;
1536 emit(MOV(result_dst, temp));
1537
1538 src_reg src_tmp = src_reg(result_dst);
1539 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1540
1541 src_tmp.negate = true;
1542 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1543 inst->predicate = BRW_PREDICATE_NORMAL;
1544 break;
1545 }
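
#if 0
      {
         /* A worked example of the FBH fix-up above: FBH counts from the MSB
          * side (i.e. leading zeros), so for 0x00000100 it returns 23, and
          * 31 - 23 = 8 is the LSB-side index findMSB() expects.  For an input
          * of 0, FBH returns 0xFFFFFFFF; read back as signed that is -1, so
          * the CMP against -1 fails, the predicated ADD is skipped, and -1
          * falls through unchanged, which is GLSL's "no bits set" result.
          * Illustrative only; never compiled.
          */
         int fbh = 23;              /* FBH(0x00000100) */
         int find_msb = 31 - fbh;   /* == 8 */
         (void)find_msb;
      }
#endif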
1546 case ir_unop_find_lsb:
1547 emit(FBL(result_dst, op[0]));
1548 break;
1549 case ir_unop_saturate:
1550 inst = emit(MOV(result_dst, op[0]));
1551 inst->saturate = true;
1552 break;
1553
1554 case ir_unop_noise:
1555 unreachable("not reached: should be handled by lower_noise");
1556
1557 case ir_binop_add:
1558 emit(ADD(result_dst, op[0], op[1]));
1559 break;
1560 case ir_binop_sub:
1561 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1562
1563 case ir_binop_mul:
1564 if (devinfo->gen < 8 && ir->type->is_integer()) {
1565 /* For integer multiplication, the MUL uses the low 16 bits of one of
1566 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1567 * accumulates the contribution of the upper 16 bits of that
1568 * operand. If we can determine that one of the args is in the low
1569 * 16 bits, though, we can just emit a single MUL.
1570 */
1571 if (ir->operands[0]->is_uint16_constant()) {
1572 if (devinfo->gen < 7)
1573 emit(MUL(result_dst, op[0], op[1]));
1574 else
1575 emit(MUL(result_dst, op[1], op[0]));
1576 } else if (ir->operands[1]->is_uint16_constant()) {
1577 if (devinfo->gen < 7)
1578 emit(MUL(result_dst, op[1], op[0]));
1579 else
1580 emit(MUL(result_dst, op[0], op[1]));
1581 } else {
1582 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1583
1584 emit(MUL(acc, op[0], op[1]));
1585 emit(MACH(dst_null_d(), op[0], op[1]));
1586 emit(MOV(result_dst, src_reg(acc)));
1587 }
1588 } else {
1589 emit(MUL(result_dst, op[0], op[1]));
1590 }
1591 break;
1592 case ir_binop_imul_high: {
1593 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1594
1595 emit(MUL(acc, op[0], op[1]));
1596 emit(MACH(result_dst, op[0], op[1]));
1597 break;
1598 }
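
#if 0
   {
      /* A scalar sketch of the MUL/MACH split used above (illustrative only,
       * never compiled): the full 32x32 product decomposes into a low-16-bit
       * partial product (what MUL computes) plus the upper-16-bit
       * contribution shifted up by 16 (what MACH adds in, leaving the high
       * 32 bits in its destination and the completed low 32 bits in the
       * accumulator).  When one operand is known to fit in 16 bits, the
       * second term is zero, which is why a lone MUL suffices in that case.
       */
      uint32_t a = 0x12345678u, b = 0x9abcdef0u;
      uint64_t low_part  = (uint64_t)a * (b & 0xffffu);
      uint64_t high_part = ((uint64_t)a * (b >> 16)) << 16;
      assert(low_part + high_part == (uint64_t)a * b);
   }
#endif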
1599 case ir_binop_div:
1600 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1601 assert(ir->type->is_integer());
1602 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1603 break;
1604
1605 case ir_binop_carry:
1606 unreachable("Should have been lowered by carry_to_arith().");
1607
1608 case ir_binop_borrow:
1609 unreachable("Should have been lowered by borrow_to_arith().");
1610
1611 case ir_binop_mod:
1612 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1613 assert(ir->type->is_integer());
1614 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1615 break;
1616
1617 case ir_binop_less:
1618 case ir_binop_greater:
1619 case ir_binop_lequal:
1620 case ir_binop_gequal:
1621 case ir_binop_equal:
1622 case ir_binop_nequal: {
1623 if (devinfo->gen <= 5) {
1624 resolve_bool_comparison(ir->operands[0], &op[0]);
1625 resolve_bool_comparison(ir->operands[1], &op[1]);
1626 }
1627 emit(CMP(result_dst, op[0], op[1],
1628 brw_conditional_for_comparison(ir->operation)));
1629 break;
1630 }
1631
1632 case ir_binop_all_equal:
1633 if (devinfo->gen <= 5) {
1634 resolve_bool_comparison(ir->operands[0], &op[0]);
1635 resolve_bool_comparison(ir->operands[1], &op[1]);
1636 }
1637
1638 /* "==" operator producing a scalar boolean. */
1639 if (ir->operands[0]->type->is_vector() ||
1640 ir->operands[1]->type->is_vector()) {
1641 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1642 emit(MOV(result_dst, src_reg(0)));
1643 inst = emit(MOV(result_dst, src_reg(~0)));
1644 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1645 } else {
1646 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1647 }
1648 break;
1649 case ir_binop_any_nequal:
1650 if (devinfo->gen <= 5) {
1651 resolve_bool_comparison(ir->operands[0], &op[0]);
1652 resolve_bool_comparison(ir->operands[1], &op[1]);
1653 }
1654
1655 /* "!=" operator producing a scalar boolean. */
1656 if (ir->operands[0]->type->is_vector() ||
1657 ir->operands[1]->type->is_vector()) {
1658 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1659
1660 emit(MOV(result_dst, src_reg(0)));
1661 inst = emit(MOV(result_dst, src_reg(~0)));
1662 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1663 } else {
1664 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1665 }
1666 break;
1667
1668 case ir_unop_any:
1669 if (devinfo->gen <= 5) {
1670 resolve_bool_comparison(ir->operands[0], &op[0]);
1671 }
1672 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1673 emit(MOV(result_dst, src_reg(0)));
1674
1675 inst = emit(MOV(result_dst, src_reg(~0)));
1676 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1677 break;
1678
1679 case ir_binop_logic_xor:
1680 emit(XOR(result_dst, op[0], op[1]));
1681 break;
1682
1683 case ir_binop_logic_or:
1684 emit(OR(result_dst, op[0], op[1]));
1685 break;
1686
1687 case ir_binop_logic_and:
1688 emit(AND(result_dst, op[0], op[1]));
1689 break;
1690
1691 case ir_binop_dot:
1692 assert(ir->operands[0]->type->is_vector());
1693 assert(ir->operands[0]->type == ir->operands[1]->type);
1694 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1695 break;
1696
1697 case ir_unop_sqrt:
1698 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1699 break;
1700 case ir_unop_rsq:
1701 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1702 break;
1703
1704 case ir_unop_bitcast_i2f:
1705 case ir_unop_bitcast_u2f:
1706 this->result = op[0];
1707 this->result.type = BRW_REGISTER_TYPE_F;
1708 break;
1709
1710 case ir_unop_bitcast_f2i:
1711 this->result = op[0];
1712 this->result.type = BRW_REGISTER_TYPE_D;
1713 break;
1714
1715 case ir_unop_bitcast_f2u:
1716 this->result = op[0];
1717 this->result.type = BRW_REGISTER_TYPE_UD;
1718 break;
1719
1720 case ir_unop_i2f:
1721 case ir_unop_i2u:
1722 case ir_unop_u2i:
1723 case ir_unop_u2f:
1724 case ir_unop_f2i:
1725 case ir_unop_f2u:
1726 emit(MOV(result_dst, op[0]));
1727 break;
1728 case ir_unop_b2i:
1729 case ir_unop_b2f:
1730 if (devinfo->gen <= 5) {
1731 resolve_bool_comparison(ir->operands[0], &op[0]);
1732 }
1733 emit(MOV(result_dst, negate(op[0])));
1734 break;
1735 case ir_unop_f2b:
1736 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1737 break;
1738 case ir_unop_i2b:
1739 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1740 break;
1741
1742 case ir_unop_trunc:
1743 emit(RNDZ(result_dst, op[0]));
1744 break;
1745 case ir_unop_ceil: {
1746 src_reg tmp = src_reg(this, ir->type);
1747 op[0].negate = !op[0].negate;
1748 emit(RNDD(dst_reg(tmp), op[0]));
1749 tmp.negate = true;
1750 emit(MOV(result_dst, tmp));
1751 }
1752 break;
1753 case ir_unop_floor:
1754 inst = emit(RNDD(result_dst, op[0]));
1755 break;
1756 case ir_unop_fract:
1757 inst = emit(FRC(result_dst, op[0]));
1758 break;
1759 case ir_unop_round_even:
1760 emit(RNDE(result_dst, op[0]));
1761 break;
1762
1763 case ir_binop_min:
1764 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1765 break;
1766 case ir_binop_max:
1767 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1768 break;
1769
1770 case ir_binop_pow:
1771 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1772 break;
1773
1774 case ir_unop_bit_not:
1775 inst = emit(NOT(result_dst, op[0]));
1776 break;
1777 case ir_binop_bit_and:
1778 inst = emit(AND(result_dst, op[0], op[1]));
1779 break;
1780 case ir_binop_bit_xor:
1781 inst = emit(XOR(result_dst, op[0], op[1]));
1782 break;
1783 case ir_binop_bit_or:
1784 inst = emit(OR(result_dst, op[0], op[1]));
1785 break;
1786
1787 case ir_binop_lshift:
1788 inst = emit(SHL(result_dst, op[0], op[1]));
1789 break;
1790
1791 case ir_binop_rshift:
1792 if (ir->type->base_type == GLSL_TYPE_INT)
1793 inst = emit(ASR(result_dst, op[0], op[1]));
1794 else
1795 inst = emit(SHR(result_dst, op[0], op[1]));
1796 break;
1797
1798 case ir_binop_bfm:
1799 emit(BFI1(result_dst, op[0], op[1]));
1800 break;
1801
1802 case ir_binop_ubo_load: {
1803 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1804 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1805 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1806 src_reg offset;
1807
1808 /* Now, load the vector from that offset. */
1809 assert(ir->type->is_vector() || ir->type->is_scalar());
1810
1811 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1812 packed_consts.type = result.type;
1813 src_reg surf_index;
1814
1815 if (const_uniform_block) {
1816 /* The block index is a constant, so just emit the binding table entry
1817 * as an immediate.
1818 */
1819 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1820 const_uniform_block->value.u[0]);
1821 } else {
1822 /* The block index is not a constant. Evaluate the index expression
1823 * per-channel and add the base UBO index; we have to select a value
1824 * from any live channel.
1825 */
1826 surf_index = src_reg(this, glsl_type::uint_type);
1827 emit(ADD(dst_reg(surf_index), op[0],
1828 src_reg(prog_data->base.binding_table.ubo_start)));
1829 emit_uniformize(dst_reg(surf_index), surf_index);
1830
1831 /* Assume this may touch any UBO. It would be nice to provide
1832 * a tighter bound, but the array information is already lowered away.
1833 */
1834 brw_mark_surface_used(&prog_data->base,
1835 prog_data->base.binding_table.ubo_start +
1836 shader_prog->NumUniformBlocks - 1);
1837 }
1838
1839 if (const_offset_ir) {
1840 if (devinfo->gen >= 8) {
1841 /* Store the offset in a GRF so we can send-from-GRF. */
1842 offset = src_reg(this, glsl_type::int_type);
1843 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1844 } else {
1845 /* Immediates are fine on older generations since they'll be moved
1846 * to a (potentially fake) MRF at the generator level.
1847 */
1848 offset = src_reg(const_offset / 16);
1849 }
1850 } else {
1851 offset = src_reg(this, glsl_type::uint_type);
1852 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1853 }
1854
1855 emit_pull_constant_load_reg(dst_reg(packed_consts),
1856 surf_index,
1857 offset,
1858 NULL, NULL /* before_block/inst */);
1859
1860 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1861 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1862 const_offset % 16 / 4,
1863 const_offset % 16 / 4,
1864 const_offset % 16 / 4);
1865
1866 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1867 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1868 emit(CMP(result_dst, packed_consts, src_reg(0u),
1869 BRW_CONDITIONAL_NZ));
1870 } else {
1871 emit(MOV(result_dst, packed_consts));
1872 }
1873 break;
1874 }
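
#if 0
   {
      /* A sketch of the offset arithmetic above (illustrative only, never
       * compiled): the pull-constant load fetches whole 16-byte vec4s, so a
       * UBO byte offset splits into a vec4 index (offset / 16, or SHR by 4
       * in the dynamic case) and a starting component within that vec4
       * ((offset % 16) / 4), which is applied by rotating the swizzle.
       * For example, a vec2 at byte offset 40 reads vec4 #2 and starts at
       * the z component.
       */
      unsigned const_offset = 40;
      unsigned vec4_index = const_offset / 16;       /* 2 */
      unsigned first_comp = const_offset % 16 / 4;   /* 2, i.e. the z channel */
      assert(vec4_index == 2 && first_comp == 2);
   }
#endif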
1875
1876 case ir_binop_vector_extract:
1877 unreachable("should have been lowered by vec_index_to_cond_assign");
1878
1879 case ir_triop_fma:
1880 op[0] = fix_3src_operand(op[0]);
1881 op[1] = fix_3src_operand(op[1]);
1882 op[2] = fix_3src_operand(op[2]);
1883 /* Note that the instruction's argument order is reversed from GLSL
1884 * and the IR.
1885 */
1886 emit(MAD(result_dst, op[2], op[1], op[0]));
1887 break;
1888
1889 case ir_triop_lrp:
1890 emit_lrp(result_dst, op[0], op[1], op[2]);
1891 break;
1892
1893 case ir_triop_csel:
1894 unreachable("already handled above");
1895 break;
1896
1897 case ir_triop_bfi:
1898 op[0] = fix_3src_operand(op[0]);
1899 op[1] = fix_3src_operand(op[1]);
1900 op[2] = fix_3src_operand(op[2]);
1901 emit(BFI2(result_dst, op[0], op[1], op[2]));
1902 break;
1903
1904 case ir_triop_bitfield_extract:
1905 op[0] = fix_3src_operand(op[0]);
1906 op[1] = fix_3src_operand(op[1]);
1907 op[2] = fix_3src_operand(op[2]);
1908 /* Note that the instruction's argument order is reversed from GLSL
1909 * and the IR.
1910 */
1911 emit(BFE(result_dst, op[2], op[1], op[0]));
1912 break;
1913
1914 case ir_triop_vector_insert:
1915 unreachable("should have been lowered by lower_vector_insert");
1916
1917 case ir_quadop_bitfield_insert:
1918 unreachable("not reached: should be handled by "
1919 "bitfield_insert_to_bfm_bfi\n");
1920
1921 case ir_quadop_vector:
1922 unreachable("not reached: should be handled by lower_quadop_vector");
1923
1924 case ir_unop_pack_half_2x16:
1925 emit_pack_half_2x16(result_dst, op[0]);
1926 break;
1927 case ir_unop_unpack_half_2x16:
1928 emit_unpack_half_2x16(result_dst, op[0]);
1929 break;
1930 case ir_unop_unpack_unorm_4x8:
1931 emit_unpack_unorm_4x8(result_dst, op[0]);
1932 break;
1933 case ir_unop_unpack_snorm_4x8:
1934 emit_unpack_snorm_4x8(result_dst, op[0]);
1935 break;
1936 case ir_unop_pack_unorm_4x8:
1937 emit_pack_unorm_4x8(result_dst, op[0]);
1938 break;
1939 case ir_unop_pack_snorm_4x8:
1940 emit_pack_snorm_4x8(result_dst, op[0]);
1941 break;
1942 case ir_unop_pack_snorm_2x16:
1943 case ir_unop_pack_unorm_2x16:
1944 case ir_unop_unpack_snorm_2x16:
1945 case ir_unop_unpack_unorm_2x16:
1946 unreachable("not reached: should be handled by lower_packing_builtins");
1947 case ir_unop_unpack_half_2x16_split_x:
1948 case ir_unop_unpack_half_2x16_split_y:
1949 case ir_binop_pack_half_2x16_split:
1950 case ir_unop_interpolate_at_centroid:
1951 case ir_binop_interpolate_at_sample:
1952 case ir_binop_interpolate_at_offset:
1953 unreachable("not reached: should not occur in vertex shader");
1954 case ir_binop_ldexp:
1955 unreachable("not reached: should be handled by ldexp_to_arith()");
1956 case ir_unop_d2f:
1957 case ir_unop_f2d:
1958 case ir_unop_d2i:
1959 case ir_unop_i2d:
1960 case ir_unop_d2u:
1961 case ir_unop_u2d:
1962 case ir_unop_d2b:
1963 case ir_unop_pack_double_2x32:
1964 case ir_unop_unpack_double_2x32:
1965 case ir_unop_frexp_sig:
1966 case ir_unop_frexp_exp:
1967 unreachable("fp64 todo");
1968 }
1969 }
1970
1971
1972 void
1973 vec4_visitor::visit(ir_swizzle *ir)
1974 {
1975 /* Note that this is only swizzles in expressions, not those on the left
1976 * hand side of an assignment, which do write masking. See ir_assignment
1977 * for that.
1978 */
1979 const unsigned swz = brw_compose_swizzle(
1980 brw_swizzle_for_size(ir->type->vector_elements),
1981 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1982
1983 ir->val->accept(this);
1984 this->result = swizzle(this->result, swz);
1985 }
1986
1987 void
1988 vec4_visitor::visit(ir_dereference_variable *ir)
1989 {
1990 const struct glsl_type *type = ir->type;
1991 dst_reg *reg = variable_storage(ir->var);
1992
1993 if (!reg) {
1994 fail("Failed to find variable storage for %s\n", ir->var->name);
1995 this->result = src_reg(brw_null_reg());
1996 return;
1997 }
1998
1999 this->result = src_reg(*reg);
2000
2001 /* System values get their swizzle from the dst_reg writemask */
2002 if (ir->var->data.mode == ir_var_system_value)
2003 return;
2004
2005 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2006 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2007 }
2008
2009
2010 int
2011 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2012 {
2013 /* Under normal circumstances array elements are stored consecutively, so
2014 * the stride is equal to the size of the array element.
2015 */
2016 return type_size(ir->type);
2017 }
2018
2019
2020 void
2021 vec4_visitor::visit(ir_dereference_array *ir)
2022 {
2023 ir_constant *constant_index;
2024 src_reg src;
2025 int array_stride = compute_array_stride(ir);
2026
2027 constant_index = ir->array_index->constant_expression_value();
2028
2029 ir->array->accept(this);
2030 src = this->result;
2031
2032 if (constant_index) {
2033 src.reg_offset += constant_index->value.i[0] * array_stride;
2034 } else {
2035 /* Variable index array dereference. It eats the "vec4" of the
2036 * base of the array and an index that offsets the Mesa register
2037 * index.
2038 */
2039 ir->array_index->accept(this);
2040
2041 src_reg index_reg;
2042
2043 if (array_stride == 1) {
2044 index_reg = this->result;
2045 } else {
2046 index_reg = src_reg(this, glsl_type::int_type);
2047
2048 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2049 }
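/* For example, for an array of mat4, array_stride is 4 (registers per
 * element), so element i ends up 4 * i vec4 rows past the base of the
 * array.
 */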
2050
2051 if (src.reladdr) {
2052 src_reg temp = src_reg(this, glsl_type::int_type);
2053
2054 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2055
2056 index_reg = temp;
2057 }
2058
2059 src.reladdr = ralloc(mem_ctx, src_reg);
2060 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
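/* Hang a heap-allocated copy of the index off src.reladdr so later passes
 * (e.g. move_grf_array_access_to_scratch) can follow and rewrite the
 * reladdr chain.
 */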
2061 }
2062
2063 /* If the type is smaller than a vec4, replicate the last channel out. */
2064 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2065 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2066 else
2067 src.swizzle = BRW_SWIZZLE_NOOP;
2068 src.type = brw_type_for_base_type(ir->type);
2069
2070 this->result = src;
2071 }
2072
2073 void
2074 vec4_visitor::visit(ir_dereference_record *ir)
2075 {
2076 unsigned int i;
2077 const glsl_type *struct_type = ir->record->type;
2078 int offset = 0;
2079
2080 ir->record->accept(this);
2081
2082 for (i = 0; i < struct_type->length; i++) {
2083 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2084 break;
2085 offset += type_size(struct_type->fields.structure[i].type);
2086 }
2087
2088 /* If the type is smaller than a vec4, replicate the last channel out. */
2089 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2090 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2091 else
2092 this->result.swizzle = BRW_SWIZZLE_NOOP;
2093 this->result.type = brw_type_for_base_type(ir->type);
2094
2095 this->result.reg_offset += offset;
2096 }
2097
2098 /**
2099 * We want to be careful in assignment setup to hit the actual storage
2100 * instead of potentially using a temporary like we might with the
2101 * ir_dereference handler.
2102 */
2103 static dst_reg
2104 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2105 {
2106 /* The LHS must be a dereference. If the LHS is a variable indexed array
2107 * access of a vector, it must be separated into a series of conditional
2108 * moves before reaching this point (see ir_vec_index_to_cond_assign).
2109 */
2110 assert(ir->as_dereference());
2111 ir_dereference_array *deref_array = ir->as_dereference_array();
2112 if (deref_array) {
2113 assert(!deref_array->array->type->is_vector());
2114 }
2115
2116 /* Use the rvalue deref handler for the most part. We'll ignore
2117 * swizzles in it and write swizzles using writemask, though.
2118 */
2119 ir->accept(v);
2120 return dst_reg(v->result);
2121 }
2122
2123 void
2124 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2125 const struct glsl_type *type,
2126 enum brw_predicate predicate)
2127 {
2128 if (type->base_type == GLSL_TYPE_STRUCT) {
2129 for (unsigned int i = 0; i < type->length; i++) {
2130 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2131 }
2132 return;
2133 }
2134
2135 if (type->is_array()) {
2136 for (unsigned int i = 0; i < type->length; i++) {
2137 emit_block_move(dst, src, type->fields.array, predicate);
2138 }
2139 return;
2140 }
2141
2142 if (type->is_matrix()) {
2143 const struct glsl_type *vec_type;
2144
2145 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2146 type->vector_elements, 1);
2147
2148 for (int i = 0; i < type->matrix_columns; i++) {
2149 emit_block_move(dst, src, vec_type, predicate);
2150 }
2151 return;
2152 }
2153
2154 assert(type->is_scalar() || type->is_vector());
2155
2156 dst->type = brw_type_for_base_type(type);
2157 src->type = dst->type;
2158
2159 dst->writemask = (1 << type->vector_elements) - 1;
2160
2161 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2162
2163 vec4_instruction *inst = emit(MOV(*dst, *src));
2164 inst->predicate = predicate;
2165
2166 dst->reg_offset++;
2167 src->reg_offset++;
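/* dst and src are updated in place, so the struct/array/matrix loops
 * above continue with the next vec4 of the aggregate on the next call.
 */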
2168 }
2169
2170
2171 /* If the RHS processing resulted in an instruction generating a
2172 * temporary value, and it would be easy to rewrite the instruction to
2173 * generate its result right into the LHS instead, do so. This ends
2174 * up reliably removing instructions where it can be tricky to do so
2175 * later without real UD chain information.
2176 */
2177 bool
2178 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2179 dst_reg dst,
2180 src_reg src,
2181 vec4_instruction *pre_rhs_inst,
2182 vec4_instruction *last_rhs_inst)
2183 {
2184 /* This could be supported, but it would take more smarts. */
2185 if (ir->condition)
2186 return false;
2187
2188 if (pre_rhs_inst == last_rhs_inst)
2189 return false; /* No instructions generated to work with. */
2190
2191 /* Make sure the last instruction generated our source reg. */
2192 if (src.file != GRF ||
2193 src.file != last_rhs_inst->dst.file ||
2194 src.reg != last_rhs_inst->dst.reg ||
2195 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2196 src.reladdr ||
2197 src.abs ||
2198 src.negate ||
2199 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2200 return false;
2201
2202 /* Check that the last instruction fully initialized the channels
2203 * we want to use, in the order we want to use them. We could
2204 * potentially reswizzle the operands of many instructions so that
2205 * we could handle out of order channels, but don't yet.
2206 */
2207
2208 for (unsigned i = 0; i < 4; i++) {
2209 if (dst.writemask & (1 << i)) {
2210 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2211 return false;
2212
2213 if (BRW_GET_SWZ(src.swizzle, i) != i)
2214 return false;
2215 }
2216 }
2217
2218 /* Success! Rewrite the instruction. */
2219 last_rhs_inst->dst.file = dst.file;
2220 last_rhs_inst->dst.reg = dst.reg;
2221 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2222 last_rhs_inst->dst.reladdr = dst.reladdr;
2223 last_rhs_inst->dst.writemask &= dst.writemask;
2224
2225 return true;
2226 }
2227
2228 void
2229 vec4_visitor::visit(ir_assignment *ir)
2230 {
2231 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2232 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2233
2234 if (!ir->lhs->type->is_scalar() &&
2235 !ir->lhs->type->is_vector()) {
2236 ir->rhs->accept(this);
2237 src_reg src = this->result;
2238
2239 if (ir->condition) {
2240 emit_bool_to_cond_code(ir->condition, &predicate);
2241 }
2242
2243 /* emit_block_move doesn't account for swizzles in the source register.
2244 * This should be ok, since the source register is a structure or an
2245 * array, and those can't be swizzled. But double-check to be sure.
2246 */
2247 assert(src.swizzle ==
2248 (ir->rhs->type->is_matrix()
2249 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2250 : BRW_SWIZZLE_NOOP));
2251
2252 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2253 return;
2254 }
2255
2256 /* Now we're down to just a scalar/vector with writemasks. */
2257 int i;
2258
2259 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2260 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2261
2262 ir->rhs->accept(this);
2263
2264 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2265
2266 int swizzles[4];
2267 int src_chan = 0;
2268
2269 assert(ir->lhs->type->is_vector() ||
2270 ir->lhs->type->is_scalar());
2271 dst.writemask = ir->write_mask;
2272
2273 /* Swizzle a small RHS vector into the channels being written.
2274 *
2275 * GLSL IR treats write_mask as dictating how many channels are
2276 * present on the RHS, while our instructions need those channels to
2277 * appear in the slots of the vec4 they're written to.
2278 */
2279 for (int i = 0; i < 4; i++)
2280 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2281
2282 src_reg src = swizzle(this->result,
2283 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2284 swizzles[2], swizzles[3]));
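/* For example, a write mask of .yw turns a two-channel RHS into the
 * swizzle XXXY: the write to .y reads the RHS's first channel, the write
 * to .w reads its second, and the unwritten channels don't matter.
 */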
2285
2286 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2287 return;
2288 }
2289
2290 if (ir->condition) {
2291 emit_bool_to_cond_code(ir->condition, &predicate);
2292 }
2293
2294 for (i = 0; i < type_size(ir->lhs->type); i++) {
2295 vec4_instruction *inst = emit(MOV(dst, src));
2296 inst->predicate = predicate;
2297
2298 dst.reg_offset++;
2299 src.reg_offset++;
2300 }
2301 }
2302
2303 void
2304 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2305 {
2306 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2307 foreach_in_list(ir_constant, field_value, &ir->components) {
2308 emit_constant_values(dst, field_value);
2309 }
2310 return;
2311 }
2312
2313 if (ir->type->is_array()) {
2314 for (unsigned int i = 0; i < ir->type->length; i++) {
2315 emit_constant_values(dst, ir->array_elements[i]);
2316 }
2317 return;
2318 }
2319
2320 if (ir->type->is_matrix()) {
2321 for (int i = 0; i < ir->type->matrix_columns; i++) {
2322 float *vec = &ir->value.f[i * ir->type->vector_elements];
2323
2324 for (int j = 0; j < ir->type->vector_elements; j++) {
2325 dst->writemask = 1 << j;
2326 dst->type = BRW_REGISTER_TYPE_F;
2327
2328 emit(MOV(*dst, src_reg(vec[j])));
2329 }
2330 dst->reg_offset++;
2331 }
2332 return;
2333 }
2334
2335 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2336
2337 for (int i = 0; i < ir->type->vector_elements; i++) {
2338 if (!(remaining_writemask & (1 << i)))
2339 continue;
2340
2341 dst->writemask = 1 << i;
2342 dst->type = brw_type_for_base_type(ir->type);
2343
2344 /* Find other components that match the one we're about to
2345 * write. Emits fewer instructions for things like vec4(0.5,
2346 * 1.5, 1.5, 1.5).
2347 */
2348 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2349 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2350 if (ir->value.b[i] == ir->value.b[j])
2351 dst->writemask |= (1 << j);
2352 } else {
2353 /* u, i, and f storage all line up, so no need for a
2354 * switch case for comparing each type.
2355 */
2356 if (ir->value.u[i] == ir->value.u[j])
2357 dst->writemask |= (1 << j);
2358 }
2359 }
2360
2361 switch (ir->type->base_type) {
2362 case GLSL_TYPE_FLOAT:
2363 emit(MOV(*dst, src_reg(ir->value.f[i])));
2364 break;
2365 case GLSL_TYPE_INT:
2366 emit(MOV(*dst, src_reg(ir->value.i[i])));
2367 break;
2368 case GLSL_TYPE_UINT:
2369 emit(MOV(*dst, src_reg(ir->value.u[i])));
2370 break;
2371 case GLSL_TYPE_BOOL:
2372 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2373 break;
2374 default:
2375 unreachable("Non-float/uint/int/bool constant");
2376 }
2377
2378 remaining_writemask &= ~dst->writemask;
2379 }
2380 dst->reg_offset++;
2381 }
2382
2383 void
2384 vec4_visitor::visit(ir_constant *ir)
2385 {
2386 dst_reg dst = dst_reg(this, ir->type);
2387 this->result = src_reg(dst);
2388
2389 emit_constant_values(&dst, ir);
2390 }
2391
2392 void
2393 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2394 {
2395 ir_dereference *deref = static_cast<ir_dereference *>(
2396 ir->actual_parameters.get_head());
2397 ir_variable *location = deref->variable_referenced();
2398 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2399 location->data.binding);
2400
2401 /* Calculate the surface offset */
2402 src_reg offset(this, glsl_type::uint_type);
2403 ir_dereference_array *deref_array = deref->as_dereference_array();
2404 if (deref_array) {
2405 deref_array->array_index->accept(this);
2406
2407 src_reg tmp(this, glsl_type::uint_type);
2408 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2409 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2410 } else {
2411 offset = location->data.atomic.offset;
2412 }
2413
2414 /* Emit the appropriate machine instruction */
2415 const char *callee = ir->callee->function_name();
2416 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2417
2418 if (!strcmp("__intrinsic_atomic_read", callee)) {
2419 emit_untyped_surface_read(surf_index, dst, offset);
2420
2421 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2422 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2423 src_reg(), src_reg());
2424
2425 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2426 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2427 src_reg(), src_reg());
2428 }
2429 }
2430
2431 void
2432 vec4_visitor::visit(ir_call *ir)
2433 {
2434 const char *callee = ir->callee->function_name();
2435
2436 if (!strcmp("__intrinsic_atomic_read", callee) ||
2437 !strcmp("__intrinsic_atomic_increment", callee) ||
2438 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2439 visit_atomic_counter_intrinsic(ir);
2440 } else {
2441 unreachable("Unsupported intrinsic.");
2442 }
2443 }
2444
2445 src_reg
2446 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2447 {
2448 vec4_instruction *inst =
2449 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2450 dst_reg(this, glsl_type::uvec4_type));
2451 inst->base_mrf = 2;
2452 inst->src[1] = sampler;
2453
2454 int param_base;
2455
2456 if (devinfo->gen >= 9) {
2457 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2458 vec4_instruction *header_inst = new(mem_ctx)
2459 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2460 dst_reg(MRF, inst->base_mrf));
2461
2462 emit(header_inst);
2463
2464 inst->mlen = 2;
2465 inst->header_size = 1;
2466 param_base = inst->base_mrf + 1;
2467 } else {
2468 inst->mlen = 1;
2469 param_base = inst->base_mrf;
2470 }
2471
2472 /* The parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2473 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2474 int zero_mask = 0xf & ~coord_mask;
2475
2476 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2477 coordinate));
2478
2479 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2480 src_reg(0)));
2481
2482 emit(inst);
2483 return src_reg(inst->dst);
2484 }
2485
2486 static bool
2487 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2488 {
2489 if (devinfo->gen < 8 && !devinfo->is_haswell)
2490 return false;
2491
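/* Sampler indices above 15 don't fit in the 4-bit sampler index field of
 * the message descriptor, so the generator has to adjust the sampler
 * state pointer in the message header instead. A non-immediate
 * (dynamically indexed) sampler is treated conservatively the same way.
 */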
2492 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2493 }
2494
2495 void
2496 vec4_visitor::visit(ir_texture *ir)
2497 {
2498 uint32_t sampler =
2499 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2500
2501 ir_rvalue *nonconst_sampler_index =
2502 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2503
2504 /* Handle non-constant sampler array indexing */
2505 src_reg sampler_reg;
2506 if (nonconst_sampler_index) {
2507 /* The highest sampler which may be used by this operation is
2508 * the last element of the array. Mark it here, because the generator
2509 * doesn't have enough information to determine the bound.
2510 */
2511 uint32_t array_size = ir->sampler->as_dereference_array()
2512 ->array->type->array_size();
2513
2514 uint32_t max_used = sampler + array_size - 1;
2515 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2516 max_used += prog_data->base.binding_table.gather_texture_start;
2517 } else {
2518 max_used += prog_data->base.binding_table.texture_start;
2519 }
2520
2521 brw_mark_surface_used(&prog_data->base, max_used);
2522
2523 /* Emit code to evaluate the actual indexing expression */
2524 nonconst_sampler_index->accept(this);
2525 dst_reg temp(this, glsl_type::uint_type);
2526 emit(ADD(temp, this->result, src_reg(sampler)));
2527 emit_uniformize(temp, src_reg(temp));
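/* As with the UBO block index above, the sampler index applies to the
 * whole message, so emit_uniformize selects the value from a single live
 * channel.
 */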
2528
2529 sampler_reg = src_reg(temp);
2530 } else {
2531 /* Single sampler, or constant array index; the indexing expression
2532 * is just an immediate.
2533 */
2534 sampler_reg = src_reg(sampler);
2535 }
2536
2537 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2538 * emitting anything other than setting up the constant result.
2539 */
2540 if (ir->op == ir_tg4) {
2541 ir_constant *chan = ir->lod_info.component->as_constant();
2542 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2543 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2544 dst_reg result(this, ir->type);
2545 this->result = src_reg(result);
2546 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2547 return;
2548 }
2549 }
2550
2551 /* Should be lowered by do_lower_texture_projection */
2552 assert(!ir->projector);
2553
2554 /* Should be lowered */
2555 assert(!ir->offset || !ir->offset->type->is_array());
2556
2557 /* Generate code to compute all the subexpression trees. This has to be
2558 * done before loading any values into MRFs for the sampler message since
2559 * generating these values may involve SEND messages that need the MRFs.
2560 */
2561 src_reg coordinate;
2562 if (ir->coordinate) {
2563 ir->coordinate->accept(this);
2564 coordinate = this->result;
2565 }
2566
2567 src_reg shadow_comparitor;
2568 if (ir->shadow_comparitor) {
2569 ir->shadow_comparitor->accept(this);
2570 shadow_comparitor = this->result;
2571 }
2572
2573 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2574 src_reg offset_value;
2575 if (has_nonconstant_offset) {
2576 ir->offset->accept(this);
2577 offset_value = src_reg(this->result);
2578 }
2579
2580 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2581 src_reg lod, dPdx, dPdy, sample_index, mcs;
2582 switch (ir->op) {
2583 case ir_tex:
2584 lod = src_reg(0.0f);
2585 lod_type = glsl_type::float_type;
2586 break;
2587 case ir_txf:
2588 case ir_txl:
2589 case ir_txs:
2590 ir->lod_info.lod->accept(this);
2591 lod = this->result;
2592 lod_type = ir->lod_info.lod->type;
2593 break;
2594 case ir_query_levels:
2595 lod = src_reg(0);
2596 lod_type = glsl_type::int_type;
2597 break;
2598 case ir_txf_ms:
2599 ir->lod_info.sample_index->accept(this);
2600 sample_index = this->result;
2601 sample_index_type = ir->lod_info.sample_index->type;
2602
2603 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2604 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2605 else
2606 mcs = src_reg(0u);
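/* The compressed multisample fetch needs the MCS value loaded above;
 * passing a constant zero otherwise presumably matches what an
 * uncompressed surface's MCS would contain.
 */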
2607 break;
2608 case ir_txd:
2609 ir->lod_info.grad.dPdx->accept(this);
2610 dPdx = this->result;
2611
2612 ir->lod_info.grad.dPdy->accept(this);
2613 dPdy = this->result;
2614
2615 lod_type = ir->lod_info.grad.dPdx->type;
2616 break;
2617 case ir_txb:
2618 case ir_lod:
2619 case ir_tg4:
2620 break;
2621 }
2622
2623 enum opcode opcode;
2624 switch (ir->op) {
2625 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2626 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2627 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2628 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2629 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2630 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2631 case ir_tg4: opcode = has_nonconstant_offset
2632 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2633 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2634 case ir_txb:
2635 unreachable("TXB is not valid for vertex shaders.");
2636 case ir_lod:
2637 unreachable("LOD is not valid for vertex shaders.");
2638 default:
2639 unreachable("Unrecognized tex op");
2640 }
2641
2642 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2643 opcode, dst_reg(this, ir->type));
2644
2645 if (ir->offset != NULL && !has_nonconstant_offset) {
2646 inst->offset =
2647 brw_texture_offset(ir->offset->as_constant()->value.i,
2648 ir->offset->type->vector_elements);
2649 }
2650
2651 /* Stuff the channel select bits in the top of the texture offset */
2652 if (ir->op == ir_tg4)
2653 inst->offset |= gather_channel(ir, sampler) << 16;
2654
2655 /* The message header is necessary for:
2656 * - Gen4 (always)
2657 * - Gen9+ for selecting SIMD4x2
2658 * - Texel offsets
2659 * - Gather channel selection
2660 * - Sampler indices too large to fit in a 4-bit value.
2661 */
2662 inst->header_size =
2663 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2664 inst->offset != 0 || ir->op == ir_tg4 ||
2665 is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2666 inst->base_mrf = 2;
2667 inst->mlen = inst->header_size + 1; /* always at least one */
2668 inst->dst.writemask = WRITEMASK_XYZW;
2669 inst->shadow_compare = ir->shadow_comparitor != NULL;
2670
2671 inst->src[1] = sampler_reg;
2672
2673 /* MRF for the first parameter */
2674 int param_base = inst->base_mrf + inst->header_size;
2675
2676 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2677 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2678 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2679 } else {
2680 /* Load the coordinate */
2681 /* FINISHME: gl_clamp_mask and saturate */
2682 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2683 int zero_mask = 0xf & ~coord_mask;
2684
2685 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2686 coordinate));
2687
2688 if (zero_mask != 0) {
2689 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2690 src_reg(0)));
2691 }
2692 /* Load the shadow comparitor */
2693 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2694 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2695 WRITEMASK_X),
2696 shadow_comparitor));
2697 inst->mlen++;
2698 }
2699
2700 /* Load the LOD info */
2701 if (ir->op == ir_tex || ir->op == ir_txl) {
2702 int mrf, writemask;
2703 if (devinfo->gen >= 5) {
2704 mrf = param_base + 1;
2705 if (ir->shadow_comparitor) {
2706 writemask = WRITEMASK_Y;
2707 /* mlen already incremented */
2708 } else {
2709 writemask = WRITEMASK_X;
2710 inst->mlen++;
2711 }
2712 } else /* devinfo->gen == 4 */ {
2713 mrf = param_base;
2714 writemask = WRITEMASK_W;
2715 }
2716 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2717 } else if (ir->op == ir_txf) {
2718 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2719 } else if (ir->op == ir_txf_ms) {
2720 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2721 sample_index));
2722 if (devinfo->gen >= 7) {
2723 /* MCS data is in the first channel of `mcs`, but we need to get it into
2724 * the .y channel of the second vec4 of params, so replicate .x across
2725 * the whole vec4 and then mask off everything except .y
2726 */
2727 mcs.swizzle = BRW_SWIZZLE_XXXX;
2728 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2729 mcs));
2730 }
2731 inst->mlen++;
2732 } else if (ir->op == ir_txd) {
2733 const glsl_type *type = lod_type;
2734
2735 if (devinfo->gen >= 5) {
2736 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2737 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2738 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2739 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2740 inst->mlen++;
2741
2742 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2743 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2744 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2745 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2746 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2747 inst->mlen++;
2748
2749 if (ir->shadow_comparitor) {
2750 emit(MOV(dst_reg(MRF, param_base + 2,
2751 ir->shadow_comparitor->type, WRITEMASK_Z),
2752 shadow_comparitor));
2753 }
2754 }
2755 } else /* devinfo->gen == 4 */ {
2756 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2757 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2758 inst->mlen += 2;
2759 }
2760 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2761 if (ir->shadow_comparitor) {
2762 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2763 shadow_comparitor));
2764 }
2765
2766 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2767 offset_value));
2768 inst->mlen++;
2769 }
2770 }
2771
2772 emit(inst);
2773
2774 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2775 * faces * layers, but the spec requires just the layer count.
2776 */
2777 if (ir->op == ir_txs) {
2778 glsl_type const *type = ir->sampler->type;
2779 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2780 type->sampler_array) {
2781 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2782 writemask(inst->dst, WRITEMASK_Z),
2783 src_reg(inst->dst), src_reg(6));
2784 }
2785 }
2786
2787 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2788 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2789 }
2790
2791 swizzle_result(ir, src_reg(inst->dst), sampler);
2792 }
2793
2794 /**
2795 * Apply workarounds for Gen6 gather with UINT/SINT
2796 */
2797 void
2798 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2799 {
2800 if (!wa)
2801 return;
2802
2803 int width = (wa & WA_8BIT) ? 8 : 16;
2804 dst_reg dst_f = dst;
2805 dst_f.type = BRW_REGISTER_TYPE_F;
2806
2807 /* Convert from UNORM to UINT */
2808 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2809 emit(MOV(dst, src_reg(dst_f)));
2810
2811 if (wa & WA_SIGN) {
2812 /* Reinterpret the UINT value as a signed INT value by
2813 * shifting the sign bit into place, then shifting back
2814 * preserving sign.
2815 */
2816 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2817 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
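/* For the 8-bit case this is a shift left by 24 followed by an arithmetic
 * shift right by 24, i.e. a plain sign extension of the low 8 bits.
 */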
2818 }
2819 }
2820
2821 /**
2822 * Set up the gather channel based on the swizzle, for gather4.
2823 */
2824 uint32_t
2825 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2826 {
2827 ir_constant *chan = ir->lod_info.component->as_constant();
2828 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2829 switch (swiz) {
2830 case SWIZZLE_X: return 0;
2831 case SWIZZLE_Y:
2832 /* gather4 sampler is broken for green channel on RG32F --
2833 * we must ask for blue instead.
2834 */
2835 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2836 return 2;
2837 return 1;
2838 case SWIZZLE_Z: return 2;
2839 case SWIZZLE_W: return 3;
2840 default:
2841 unreachable("Not reached"); /* zero, one swizzles handled already */
2842 }
2843 }
2844
2845 void
2846 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2847 {
2848 int s = key->tex.swizzles[sampler];
2849
2850 this->result = src_reg(this, ir->type);
2851 dst_reg swizzled_result(this->result);
2852
2853 if (ir->op == ir_query_levels) {
2854 /* # levels is in .w */
2855 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2856 emit(MOV(swizzled_result, orig_val));
2857 return;
2858 }
2859
2860 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2861 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2862 emit(MOV(swizzled_result, orig_val));
2863 return;
2864 }
2865
2866
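/* Split the GL texture swizzle into three writemasks: channels that copy
 * a component of the sampled result, channels forced to zero and channels
 * forced to one, so each group can be handled with a single MOV below.
 */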
2867 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2868 int swizzle[4] = {0};
2869
2870 for (int i = 0; i < 4; i++) {
2871 switch (GET_SWZ(s, i)) {
2872 case SWIZZLE_ZERO:
2873 zero_mask |= (1 << i);
2874 break;
2875 case SWIZZLE_ONE:
2876 one_mask |= (1 << i);
2877 break;
2878 default:
2879 copy_mask |= (1 << i);
2880 swizzle[i] = GET_SWZ(s, i);
2881 break;
2882 }
2883 }
2884
2885 if (copy_mask) {
2886 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2887 swizzled_result.writemask = copy_mask;
2888 emit(MOV(swizzled_result, orig_val));
2889 }
2890
2891 if (zero_mask) {
2892 swizzled_result.writemask = zero_mask;
2893 emit(MOV(swizzled_result, src_reg(0.0f)));
2894 }
2895
2896 if (one_mask) {
2897 swizzled_result.writemask = one_mask;
2898 emit(MOV(swizzled_result, src_reg(1.0f)));
2899 }
2900 }
2901
2902 void
2903 vec4_visitor::visit(ir_return *)
2904 {
2905 unreachable("not reached");
2906 }
2907
2908 void
2909 vec4_visitor::visit(ir_discard *)
2910 {
2911 unreachable("not reached");
2912 }
2913
2914 void
2915 vec4_visitor::visit(ir_if *ir)
2916 {
2917 /* Don't point the annotation at the if statement, because then it plus
2918 * the then and else blocks get printed.
2919 */
2920 this->base_ir = ir->condition;
2921
2922 if (devinfo->gen == 6) {
2923 emit_if_gen6(ir);
2924 } else {
2925 enum brw_predicate predicate;
2926 emit_bool_to_cond_code(ir->condition, &predicate);
2927 emit(IF(predicate));
2928 }
2929
2930 visit_instructions(&ir->then_instructions);
2931
2932 if (!ir->else_instructions.is_empty()) {
2933 this->base_ir = ir->condition;
2934 emit(BRW_OPCODE_ELSE);
2935
2936 visit_instructions(&ir->else_instructions);
2937 }
2938
2939 this->base_ir = ir->condition;
2940 emit(BRW_OPCODE_ENDIF);
2941 }
2942
2943 void
2944 vec4_visitor::visit(ir_emit_vertex *)
2945 {
2946 unreachable("not reached");
2947 }
2948
2949 void
2950 vec4_visitor::visit(ir_end_primitive *)
2951 {
2952 unreachable("not reached");
2953 }
2954
2955 void
2956 vec4_visitor::visit(ir_barrier *)
2957 {
2958 unreachable("not reached");
2959 }
2960
2961 void
2962 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2963 dst_reg dst, src_reg offset,
2964 src_reg src0, src_reg src1)
2965 {
2966 unsigned mlen = 0;
2967
2968 /* Set the atomic operation offset. */
2969 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2970 mlen++;
2971
2972 /* Set the atomic operation arguments. */
2973 if (src0.file != BAD_FILE) {
2974 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2975 mlen++;
2976 }
2977
2978 if (src1.file != BAD_FILE) {
2979 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2980 mlen++;
2981 }
2982
2983 /* Emit the instruction. Note that this maps to the normal SIMD8
2984 * untyped atomic message on Ivy Bridge, but that's OK because
2985 * unused channels will be masked out.
2986 */
2987 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2988 brw_message_reg(0),
2989 src_reg(surf_index), src_reg(atomic_op));
2990 inst->mlen = mlen;
2991 }
2992
2993 void
2994 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2995 src_reg offset)
2996 {
2997 /* Set the surface read offset. */
2998 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2999
3000 /* Emit the instruction. Note that this maps to the normal SIMD8
3001 * untyped surface read message, but that's OK because unused
3002 * channels will be masked out.
3003 */
3004 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3005 brw_message_reg(0),
3006 src_reg(surf_index), src_reg(1));
3007 inst->mlen = 1;
3008 }
3009
3010 void
3011 vec4_visitor::emit_ndc_computation()
3012 {
3013 /* Get the position */
3014 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3015
3016 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3017 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3018 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3019
3020 current_annotation = "NDC";
3021 dst_reg ndc_w = ndc;
3022 ndc_w.writemask = WRITEMASK_W;
3023 src_reg pos_w = pos;
3024 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3025 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3026
3027 dst_reg ndc_xyz = ndc;
3028 ndc_xyz.writemask = WRITEMASK_XYZ;
3029
3030 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3031 }
3032
3033 void
3034 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3035 {
3036 if (devinfo->gen < 6 &&
3037 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3038 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3039 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3040 dst_reg header1_w = header1;
3041 header1_w.writemask = WRITEMASK_W;
3042
3043 emit(MOV(header1, 0u));
3044
3045 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3046 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3047
3048 current_annotation = "Point size";
3049 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3050 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
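/* Multiplying by 2^11 and masking with 0x7ff << 8 leaves the point width
 * as an 11-bit fixed-point value in bits 8..18 of the header dword, which
 * appears to be the U8.3 format the pre-gen6 VUE header expects.
 */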
3051 }
3052
3053 if (key->userclip_active) {
3054 current_annotation = "Clipping flags";
3055 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3056 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3057
3058 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3059 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3060 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3061
3062 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3063 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3064 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3065 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3066 }
3067
3068 /* i965 clipping workaround:
3069 * 1) Test for negative rhw
3070 * 2) If set,
3071 * set ndc = (0,0,0,0)
3072 * set ucp[6] = 1
3073 *
3074 * Later, clipping will detect ucp[6] and ensure the primitive is
3075 * clipped against all fixed planes.
3076 */
3077 if (devinfo->has_negative_rhw_bug) {
3078 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3079 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3080 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3081 vec4_instruction *inst;
3082 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3083 inst->predicate = BRW_PREDICATE_NORMAL;
3084 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3085 inst->predicate = BRW_PREDICATE_NORMAL;
3086 }
3087
3088 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3089 } else if (devinfo->gen < 6) {
3090 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3091 } else {
3092 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
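/* On gen6+ this header slot starts out zeroed and individual fields are
 * filled in below: layer in .y, viewport index in .z and point size in
 * .w, each only when the corresponding varying slot is valid.
 */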
3093 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3094 dst_reg reg_w = reg;
3095 reg_w.writemask = WRITEMASK_W;
3096 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3097 }
3098 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3099 dst_reg reg_y = reg;
3100 reg_y.writemask = WRITEMASK_Y;
3101 reg_y.type = BRW_REGISTER_TYPE_D;
3102 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3103 }
3104 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3105 dst_reg reg_z = reg;
3106 reg_z.writemask = WRITEMASK_Z;
3107 reg_z.type = BRW_REGISTER_TYPE_D;
3108 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3109 }
3110 }
3111 }
3112
3113 void
3114 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3115 {
3116 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3117 *
3118 * "If a linked set of shaders forming the vertex stage contains no
3119 * static write to gl_ClipVertex or gl_ClipDistance, but the
3120 * application has requested clipping against user clip planes through
3121 * the API, then the coordinate written to gl_Position is used for
3122 * comparison against the user clip planes."
3123 *
3124 * This function is only called if the shader didn't write to
3125 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3126 * if the user wrote to it; otherwise we use gl_Position.
3127 */
3128 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3129 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3130 clip_vertex = VARYING_SLOT_POS;
3131 }
3132
3133 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3134 ++i) {
3135 reg.writemask = 1 << i;
3136 emit(DP4(reg,
3137 src_reg(output_reg[clip_vertex]),
3138 src_reg(this->userplane[i + offset])));
3139 }
3140 }
3141
3142 vec4_instruction *
3143 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3144 {
3145 assert (varying < VARYING_SLOT_MAX);
3146 reg.type = output_reg[varying].type;
3147 current_annotation = output_reg_annotation[varying];
3148 /* Copy the register, saturating if necessary */
3149 return emit(MOV(reg, src_reg(output_reg[varying])));
3150 }
3151
3152 void
3153 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3154 {
3155 reg.type = BRW_REGISTER_TYPE_F;
3156
3157 switch (varying) {
3158 case VARYING_SLOT_PSIZ:
3159 {
3160 /* PSIZ is always in slot 0, and is coupled with other flags. */
3161 current_annotation = "indices, point width, clip flags";
3162 emit_psiz_and_flags(reg);
3163 break;
3164 }
3165 case BRW_VARYING_SLOT_NDC:
3166 current_annotation = "NDC";
3167 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3168 break;
3169 case VARYING_SLOT_POS:
3170 current_annotation = "gl_Position";
3171 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3172 break;
3173 case VARYING_SLOT_EDGE:
3174 /* This is present when doing unfilled polygons. We're supposed to copy
3175 * the edge flag from the user-provided vertex array
3176 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3177 * of that attribute (starts as 1.0f). This is then used in clipping to
3178 * determine which edges should be drawn as wireframe.
3179 */
3180 current_annotation = "edge flag";
3181 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3182 glsl_type::float_type, WRITEMASK_XYZW))));
3183 break;
3184 case BRW_VARYING_SLOT_PAD:
3185 /* No need to write to this slot */
3186 break;
3187 case VARYING_SLOT_COL0:
3188 case VARYING_SLOT_COL1:
3189 case VARYING_SLOT_BFC0:
3190 case VARYING_SLOT_BFC1: {
3191 /* These built-in varyings are only supported in compatibility mode,
3192 * and we only support GS in core profile. So, this must be a vertex
3193 * shader.
3194 */
3195 assert(stage == MESA_SHADER_VERTEX);
3196 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3197 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3198 inst->saturate = true;
3199 break;
3200 }
3201
3202 default:
3203 emit_generic_urb_slot(reg, varying);
3204 break;
3205 }
3206 }
3207
3208 static int
3209 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3210 {
3211 if (devinfo->gen >= 6) {
3212 /* URB data written (does not include the message header reg) must
3213 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3214 * section 5.4.3.2.2: URB_INTERLEAVED.
3215 *
3216 * URB entries are allocated on a multiple of 1024 bits, so an
3217 * extra 128 bits written here to make the end align to 256 is
3218 * no problem.
3219 */
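/* mlen includes the one-register message header, so the data portion is
 * mlen - 1 registers and must be even; an odd mlen is already aligned.
 * For example, a header plus 3 data registers (mlen 4) gets padded to a
 * header plus 4 data registers (mlen 5).
 */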
3220 if ((mlen % 2) != 1)
3221 mlen++;
3222 }
3223
3224 return mlen;
3225 }
3226
3227
3228 /**
3229 * Generates the VUE payload plus the necessary URB write instructions to
3230 * output it.
3231 *
3232 * The VUE layout is documented in Volume 2a.
3233 */
3234 void
3235 vec4_visitor::emit_vertex()
3236 {
3237 /* MRF 0 is reserved for the debugger, so start with message header
3238 * in MRF 1.
3239 */
3240 int base_mrf = 1;
3241 int mrf = base_mrf;
3242 /* In the process of generating our URB write message contents, we
3243 * may need to unspill a register or load from an array. Those
3244 * reads would use MRFs 14-15.
3245 */
3246 int max_usable_mrf = 13;
3247
3248 /* The following assertion verifies that max_usable_mrf causes an
3249 * even-numbered amount of URB write data, which will meet gen6's
3250 * requirements for length alignment.
3251 */
3252 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3253
3254 /* First mrf is the g0-based message header containing URB handles and
3255 * such.
3256 */
3257 emit_urb_write_header(mrf++);
3258
3259 if (devinfo->gen < 6) {
3260 emit_ndc_computation();
3261 }
3262
3263 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3264 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3265 current_annotation = "user clip distances";
3266
3267 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3268 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3269
3270 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3271 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3272 }
3273
3274 /* We may need to split this up into several URB writes, so do them in a
3275 * loop.
3276 */
3277 int slot = 0;
3278 bool complete = false;
3279 do {
3280 /* URB offset is in URB row increments, and each of our MRFs is half of
3281 * one of those, since we're doing interleaved writes.
3282 */
3283 int offset = slot / 2;
3284
3285 mrf = base_mrf + 1;
3286 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3287 emit_urb_slot(dst_reg(MRF, mrf++),
3288 prog_data->vue_map.slot_to_varying[slot]);
3289
3290 /* If this was max_usable_mrf, we can't fit anything more into this
3291 * URB WRITE.
3292 */
3293 if (mrf > max_usable_mrf) {
3294 slot++;
3295 break;
3296 }
3297 }
3298
3299 complete = slot >= prog_data->vue_map.num_slots;
3300 current_annotation = "URB write";
3301 vec4_instruction *inst = emit_urb_write_opcode(complete);
3302 inst->base_mrf = base_mrf;
3303 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3304 inst->offset += offset;
3305 } while(!complete);
3306 }
3307
3308
3309 src_reg
3310 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3311 src_reg *reladdr, int reg_offset)
3312 {
3313 /* Because we store the values to scratch interleaved like our
3314 * vertex data, we need to scale the vec4 index by 2.
3315 */
3316 int message_header_scale = 2;
3317
3318 /* Pre-gen6, the message header uses byte offsets instead of vec4
3319 * (16-byte) offset units.
3320 */
3321 if (devinfo->gen < 6)
3322 message_header_scale *= 16;
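/* For example, a vec4 at reg_offset 3 ends up at offset 6 in 16-byte
 * units on gen6+ (each vec4 occupies a full 256-bit scratch row because
 * of the interleaving), or at byte offset 96 on gen4-5.
 */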
3323
3324 if (reladdr) {
3325 src_reg index = src_reg(this, glsl_type::int_type);
3326
3327 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3328 src_reg(reg_offset)));
3329 emit_before(block, inst, MUL(dst_reg(index), index,
3330 src_reg(message_header_scale)));
3331
3332 return index;
3333 } else {
3334 return src_reg(reg_offset * message_header_scale);
3335 }
3336 }
3337
3338 src_reg
3339 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3340 src_reg *reladdr, int reg_offset)
3341 {
3342 if (reladdr) {
3343 src_reg index = src_reg(this, glsl_type::int_type);
3344
3345 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3346 src_reg(reg_offset)));
3347
3348 /* Pre-gen6, the message header uses byte offsets instead of vec4
3349 * (16-byte) offset units.
3350 */
3351 if (devinfo->gen < 6) {
3352 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3353 }
3354
3355 return index;
3356 } else if (devinfo->gen >= 8) {
3357 /* Store the offset in a GRF so we can send-from-GRF. */
3358 src_reg offset = src_reg(this, glsl_type::int_type);
3359 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3360 return offset;
3361 } else {
3362 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3363 return src_reg(reg_offset * message_header_scale);
3364 }
3365 }
3366
3367 /**
3368 * Emits an instruction before @inst to load the value named by @orig_src
3369 * from scratch space at @base_offset to @temp.
3370 *
3371 * @base_offset is measured in 32-byte units (the size of a register).
3372 */
3373 void
3374 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3375 dst_reg temp, src_reg orig_src,
3376 int base_offset)
3377 {
3378 int reg_offset = base_offset + orig_src.reg_offset;
3379 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3380 reg_offset);
3381
3382 emit_before(block, inst, SCRATCH_READ(temp, index));
3383 }
3384
3385 /**
3386 * Emits an instruction after @inst to store the value to be written
3387 * to @orig_dst to scratch space at @base_offset, from @temp.
3388 *
3389 * @base_offset is measured in 32-byte units (the size of a register).
3390 */
3391 void
3392 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3393 int base_offset)
3394 {
3395 int reg_offset = base_offset + inst->dst.reg_offset;
3396 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3397 reg_offset);
3398
3399 /* Create a temporary register to store *inst's result in.
3400 *
3401 * We have to be careful when MOVing from our temporary result register in
3402 * the scratch write. If we swizzle from channels of the temporary that
3403 * weren't initialized, it will confuse live interval analysis, which will
3404 * make spilling fail to make progress.
3405 */
3406 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3407 inst->dst.type),
3408 brw_swizzle_for_mask(inst->dst.writemask));
3409 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3410 inst->dst.writemask));
3411 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3412 write->predicate = inst->predicate;
3413 write->ir = inst->ir;
3414 write->annotation = inst->annotation;
3415 inst->insert_after(block, write);
3416
3417 inst->dst.file = temp.file;
3418 inst->dst.reg = temp.reg;
3419 inst->dst.reg_offset = temp.reg_offset;
3420 inst->dst.reladdr = NULL;
3421 }
3422
3423 /**
3424 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3425 * adds the scratch read(s) before \p inst. The function also checks for
3426 * recursive reladdr scratch accesses, issuing the corresponding scratch
3427 * loads and rewriting reladdr references accordingly.
3428 *
3429 * \return \p src if it did not require a scratch load, otherwise, the
3430 * register holding the result of the scratch load that the caller should
3431 * use to rewrite src.
3432 */
3433 src_reg
3434 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3435 vec4_instruction *inst, src_reg src)
3436 {
3437 /* Resolve recursive reladdr scratch access by calling ourselves
3438 * with src.reladdr
3439 */
3440 if (src.reladdr)
3441 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3442 *src.reladdr);
3443
3444 /* Now handle scratch access on src */
3445 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3446 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3447 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3448 src.reg = temp.reg;
3449 src.reg_offset = temp.reg_offset;
3450 src.reladdr = NULL;
3451 }
3452
3453 return src;
3454 }
3455
3456 /**
3457 * We can't generally support array access in GRF space, because a
3458 * single instruction's destination can only span 2 contiguous
3459 * registers. So, we send all GRF arrays that get variable index
3460 * access to scratch space.
3461 */
3462 void
3463 vec4_visitor::move_grf_array_access_to_scratch()
3464 {
3465 int scratch_loc[this->alloc.count];
3466 memset(scratch_loc, -1, sizeof(scratch_loc));
3467
3468 /* First, calculate the set of virtual GRFs that need to be punted
3469 * to scratch due to having any array access on them, and where in
3470 * scratch.
3471 */
3472 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3473 if (inst->dst.file == GRF && inst->dst.reladdr) {
3474 if (scratch_loc[inst->dst.reg] == -1) {
3475 scratch_loc[inst->dst.reg] = last_scratch;
3476 last_scratch += this->alloc.sizes[inst->dst.reg];
3477 }
3478
3479 for (src_reg *iter = inst->dst.reladdr;
3480 iter->reladdr;
3481 iter = iter->reladdr) {
3482 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3483 scratch_loc[iter->reg] = last_scratch;
3484 last_scratch += this->alloc.sizes[iter->reg];
3485 }
3486 }
3487 }
3488
3489 for (int i = 0 ; i < 3; i++) {
3490 for (src_reg *iter = &inst->src[i];
3491 iter->reladdr;
3492 iter = iter->reladdr) {
3493 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3494 scratch_loc[iter->reg] = last_scratch;
3495 last_scratch += this->alloc.sizes[iter->reg];
3496 }
3497 }
3498 }
3499 }
3500
3501 /* Now, for anything that will be accessed through scratch, rewrite
3502 * it to load/store. Note that this is a _safe list walk, because
3503 * we may generate a new scratch_write instruction after the one
3504 * we're processing.
3505 */
3506 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3507 /* Set up the annotation tracking for new generated instructions. */
3508 base_ir = inst->ir;
3509 current_annotation = inst->annotation;
3510
3511 /* First handle scratch access on the dst. Notice we have to handle
3512 * the case where the dst's reladdr also points to scratch space.
3513 */
3514 if (inst->dst.reladdr)
3515 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3516 *inst->dst.reladdr);
3517
3518 /* Now that we have handled any (possibly recursive) reladdr scratch
3519 * accesses for dst we can safely do the scratch write for dst itself
3520 */
3521 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3522 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3523
3524 /* Now handle scratch access on any src. In this case, since inst->src[i]
3525 * already is a src_reg, we can just call emit_resolve_reladdr with
3526 * inst->src[i] and it will take care of handling scratch loads for
3527 * both src and src.reladdr (recursively).
3528 */
3529 for (int i = 0 ; i < 3; i++) {
3530 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3531 inst->src[i]);
3532 }
3533 }
3534 }
3535
3536 /**
3537 * Emits an instruction before @inst to load the value named by @orig_src
3538 * from the pull constant buffer (surface) at @base_offset to @temp.
3539 */
3540 void
3541 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3542 dst_reg temp, src_reg orig_src,
3543 int base_offset)
3544 {
3545 int reg_offset = base_offset + orig_src.reg_offset;
3546 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3547 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3548 reg_offset);
3549
3550 emit_pull_constant_load_reg(temp,
3551 index,
3552 offset,
3553 block, inst);
3554 }
3555
3556 /**
3557 * Implements array access of uniforms by inserting a
3558 * PULL_CONSTANT_LOAD instruction.
3559 *
3560 * Unlike temporary GRF array access (which we don't support, due to
3561 * the difficulty of doing relative addressing on instruction
3562 * destinations), we could potentially do array access of uniforms
3563 * that were loaded in GRF space as push constants. In real-world
3564 * usage we've seen, though, the arrays being used are always larger
3565 * than we could load as push constants, so just always move all
3566 * uniform array access out to a pull constant buffer.
3567 */
3568 void
3569 vec4_visitor::move_uniform_array_access_to_pull_constants()
3570 {
3571 int pull_constant_loc[this->uniforms];
3572 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3573 bool nested_reladdr;
3574
3575 /* Walk through and find array access of uniforms. Put a copy of that
3576 * uniform in the pull constant buffer.
3577 *
3578 * Note that we don't move constant-indexed accesses to arrays. No
3579 * testing has been done of the performance impact of this choice.
3580 */
3581 do {
3582 nested_reladdr = false;
3583
3584 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3585 for (int i = 0 ; i < 3; i++) {
3586 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3587 continue;
3588
3589 int uniform = inst->src[i].reg;
3590
3591 if (inst->src[i].reladdr->reladdr)
3592 nested_reladdr = true; /* will need another pass */
3593
3594 /* If this array isn't already present in the pull constant buffer,
3595 * add it.
3596 */
3597 if (pull_constant_loc[uniform] == -1) {
3598 const gl_constant_value **values =
3599 &stage_prog_data->param[uniform * 4];
3600
3601 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3602
3603 assert(uniform < uniform_array_size);
3604 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3605 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3606 = values[j];
3607 }
3608 }
3609
3610 /* Set up the annotation tracking for new generated instructions. */
3611 base_ir = inst->ir;
3612 current_annotation = inst->annotation;
3613
3614 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3615
3616 emit_pull_constant_load(block, inst, temp, inst->src[i],
3617 pull_constant_loc[uniform]);
3618
3619 inst->src[i].file = temp.file;
3620 inst->src[i].reg = temp.reg;
3621 inst->src[i].reg_offset = temp.reg_offset;
3622 inst->src[i].reladdr = NULL;
3623 }
3624 }
3625 } while (nested_reladdr);
3626
3627 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3628 * no need to track them as larger-than-vec4 objects. This will be
3629 * relied on in cutting out unused uniform vectors from push
3630 * constants.
3631 */
3632 split_uniform_registers();
3633 }
3634
3635 void
3636 vec4_visitor::resolve_ud_negate(src_reg *reg)
3637 {
3638 if (reg->type != BRW_REGISTER_TYPE_UD ||
3639 !reg->negate)
3640 return;
3641
3642 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3643 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3644 *reg = temp;
3645 }
3646
3647 /**
3648 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3649 *
3650 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3651 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3652 */
3653 void
3654 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3655 {
3656 assert(devinfo->gen <= 5);
3657
3658 if (!rvalue->type->is_boolean())
3659 return;
3660
3661 src_reg and_result = src_reg(this, rvalue->type);
3662 src_reg neg_result = src_reg(this, rvalue->type);
3663 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3664 emit(MOV(dst_reg(neg_result), negate(and_result)));
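/* The AND isolates the defined LSB, and since the type is integral the
 * negate modifier turns 1 into -1 (all ones), so the result is exactly
 * 0 or ~0.
 */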
3665 *reg = neg_result;
3666 }
3667
3668 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3669 void *log_data,
3670 struct gl_program *prog,
3671 const struct brw_vue_prog_key *key,
3672 struct brw_vue_prog_data *prog_data,
3673 struct gl_shader_program *shader_prog,
3674 gl_shader_stage stage,
3675 void *mem_ctx,
3676 bool no_spills,
3677 int shader_time_index)
3678 : backend_shader(compiler, log_data, mem_ctx,
3679 shader_prog, prog, &prog_data->base, stage),
3680 key(key),
3681 prog_data(prog_data),
3682 sanity_param_count(0),
3683 fail_msg(NULL),
3684 first_non_payload_grf(0),
3685 need_all_constants_in_pull_buffer(false),
3686 no_spills(no_spills),
3687 shader_time_index(shader_time_index),
3688 last_scratch(0)
3689 {
3690 this->failed = false;
3691
3692 this->base_ir = NULL;
3693 this->current_annotation = NULL;
3694 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3695
3696 this->variable_ht = hash_table_ctor(0,
3697 hash_table_pointer_hash,
3698 hash_table_pointer_compare);
3699
3700 this->virtual_grf_start = NULL;
3701 this->virtual_grf_end = NULL;
3702 this->live_intervals = NULL;
3703
3704 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3705
3706 this->uniforms = 0;
3707
3708 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3709 * at least one. See setup_uniforms() in brw_vec4.cpp.
3710 */
3711 this->uniform_array_size = 1;
3712 if (prog_data) {
3713 this->uniform_array_size =
3714 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3715 }
3716
3717 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3718 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3719 }
3720
3721 vec4_visitor::~vec4_visitor()
3722 {
3723 hash_table_dtor(this->variable_ht);
3724 }
3725
3726
3727 void
3728 vec4_visitor::fail(const char *format, ...)
3729 {
3730 va_list va;
3731 char *msg;
3732
3733 if (failed)
3734 return;
3735
3736 failed = true;
3737
3738 va_start(va, format);
3739 msg = ralloc_vasprintf(mem_ctx, format, va);
3740 va_end(va);
3741 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3742
3743 this->fail_msg = msg;
3744
3745 if (debug_enabled) {
3746 fprintf(stderr, "%s", msg);
3747 }
3748 }
3749
3750 } /* namespace brw */