i965/vec4: Use same type for immediate, for compaction.
[mesa.git] / src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set it
237 * to match src0, which allows the instruction to be compacted.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except for immediates, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
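/* Pre-gen6 math is a send to the shared math unit, with the operands
 * passed through MRFs: one message register for a unary op, two for a
 * binary op.
 */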
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * The upper word of each write-channel must be 0 for the following
417 * bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
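/* The bytes 0x00, 0x60, 0x70 and 0x78 are the 8-bit vector-float (VF)
 * encodings of 0.0, 8.0, 16.0 and 24.0, so the type-converting MOV above
 * leaves <0, 8, 16, 24> in the uvec4 shift register.
 */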
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of the size of the vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = brw_swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (strncmp(ir->name, storage->name, namelen) != 0 ||
690 (storage->name[namelen] != 0 &&
691 storage->name[namelen] != '.' &&
692 storage->name[namelen] != '[')) {
693 continue;
694 }
695
696 gl_constant_value *components = storage->storage;
697 unsigned vector_count = (MAX2(storage->array_elements, 1) *
698 storage->type->matrix_columns);
699
700 for (unsigned s = 0; s < vector_count; s++) {
701 assert(uniforms < uniform_array_size);
702 uniform_vector_size[uniforms] = storage->type->vector_elements;
703
704 int i;
705 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
706 stage_prog_data->param[uniforms * 4 + i] = components;
707 components++;
708 }
709 for (; i < 4; i++) {
710 static gl_constant_value zero = { 0.0 };
711 stage_prog_data->param[uniforms * 4 + i] = &zero;
712 }
713
714 uniforms++;
715 }
716 }
717 }
718
719 void
720 vec4_visitor::setup_uniform_clipplane_values()
721 {
722 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
723
724 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 4;
727 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
728 this->userplane[i].type = BRW_REGISTER_TYPE_F;
729 for (int j = 0; j < 4; ++j) {
730 stage_prog_data->param[this->uniforms * 4 + j] =
731 (gl_constant_value *) &clip_planes[i][j];
732 }
733 ++this->uniforms;
734 }
735 }
736
737 /* Our support for builtin uniforms is even scarier than non-builtin.
738 * It sits on top of the PROG_STATE_VAR parameters that are
739 * automatically updated from GL context state.
740 */
741 void
742 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
743 {
744 const ir_state_slot *const slots = ir->get_state_slots();
745 assert(slots != NULL);
746
747 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
748 /* This state reference has already been set up by ir_to_mesa,
749 * but we'll get the same index back here. We can reference
750 * ParameterValues directly, since unlike brw_fs.cpp, we never
751 * add new state references during compile.
752 */
753 int index = _mesa_add_state_reference(this->prog->Parameters,
754 (gl_state_index *)slots[i].tokens);
755 gl_constant_value *values =
756 &this->prog->Parameters->ParameterValues[index][0];
757
758 assert(this->uniforms < uniform_array_size);
759
760 for (unsigned j = 0; j < 4; j++)
761 stage_prog_data->param[this->uniforms * 4 + j] =
762 &values[GET_SWZ(slots[i].swizzle, j)];
763
764 this->uniform_vector_size[this->uniforms] =
765 (ir->type->is_scalar() || ir->type->is_vector() ||
766 ir->type->is_matrix() ? ir->type->vector_elements : 4);
767
768 this->uniforms++;
769 }
770 }
771
772 dst_reg *
773 vec4_visitor::variable_storage(ir_variable *var)
774 {
775 return (dst_reg *)hash_table_find(this->variable_ht, var);
776 }
777
778 void
779 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
780 enum brw_predicate *predicate)
781 {
782 ir_expression *expr = ir->as_expression();
783
784 *predicate = BRW_PREDICATE_NORMAL;
785
786 if (expr && expr->operation != ir_binop_ubo_load) {
787 src_reg op[3];
788 vec4_instruction *inst;
789
790 assert(expr->get_num_operands() <= 3);
791 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
792 expr->operands[i]->accept(this);
793 op[i] = this->result;
794
795 resolve_ud_negate(&op[i]);
796 }
797
798 switch (expr->operation) {
799 case ir_unop_logic_not:
800 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
801 inst->conditional_mod = BRW_CONDITIONAL_Z;
802 break;
803
804 case ir_binop_logic_xor:
805 if (devinfo->gen <= 5) {
806 src_reg temp = src_reg(this, ir->type);
807 emit(XOR(dst_reg(temp), op[0], op[1]));
808 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
809 } else {
810 inst = emit(XOR(dst_null_d(), op[0], op[1]));
811 }
812 inst->conditional_mod = BRW_CONDITIONAL_NZ;
813 break;
814
815 case ir_binop_logic_or:
816 if (devinfo->gen <= 5) {
817 src_reg temp = src_reg(this, ir->type);
818 emit(OR(dst_reg(temp), op[0], op[1]));
819 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
820 } else {
821 inst = emit(OR(dst_null_d(), op[0], op[1]));
822 }
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 break;
825
826 case ir_binop_logic_and:
827 if (devinfo->gen <= 5) {
828 src_reg temp = src_reg(this, ir->type);
829 emit(AND(dst_reg(temp), op[0], op[1]));
830 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
831 } else {
832 inst = emit(AND(dst_null_d(), op[0], op[1]));
833 }
834 inst->conditional_mod = BRW_CONDITIONAL_NZ;
835 break;
836
837 case ir_unop_f2b:
838 if (devinfo->gen >= 6) {
839 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
840 } else {
841 inst = emit(MOV(dst_null_f(), op[0]));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 break;
845
846 case ir_unop_i2b:
847 if (devinfo->gen >= 6) {
848 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
849 } else {
850 inst = emit(MOV(dst_null_d(), op[0]));
851 inst->conditional_mod = BRW_CONDITIONAL_NZ;
852 }
853 break;
854
855 case ir_binop_all_equal:
856 if (devinfo->gen <= 5) {
857 resolve_bool_comparison(expr->operands[0], &op[0]);
858 resolve_bool_comparison(expr->operands[1], &op[1]);
859 }
860 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
861 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
862 break;
863
864 case ir_binop_any_nequal:
865 if (devinfo->gen <= 5) {
866 resolve_bool_comparison(expr->operands[0], &op[0]);
867 resolve_bool_comparison(expr->operands[1], &op[1]);
868 }
869 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
870 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
871 break;
872
873 case ir_unop_any:
874 if (devinfo->gen <= 5) {
875 resolve_bool_comparison(expr->operands[0], &op[0]);
876 }
877 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
878 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
879 break;
880
881 case ir_binop_greater:
882 case ir_binop_gequal:
883 case ir_binop_less:
884 case ir_binop_lequal:
885 case ir_binop_equal:
886 case ir_binop_nequal:
887 if (devinfo->gen <= 5) {
888 resolve_bool_comparison(expr->operands[0], &op[0]);
889 resolve_bool_comparison(expr->operands[1], &op[1]);
890 }
891 emit(CMP(dst_null_d(), op[0], op[1],
892 brw_conditional_for_comparison(expr->operation)));
893 break;
894
895 case ir_triop_csel: {
896 /* Expand the boolean condition into the flag register. */
897 inst = emit(MOV(dst_null_d(), op[0]));
898 inst->conditional_mod = BRW_CONDITIONAL_NZ;
899
900 /* Select which boolean to return. */
901 dst_reg temp(this, expr->operands[1]->type);
902 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
903 inst->predicate = BRW_PREDICATE_NORMAL;
904
905 /* Expand the result to a condition code. */
906 inst = emit(MOV(dst_null_d(), src_reg(temp)));
907 inst->conditional_mod = BRW_CONDITIONAL_NZ;
908 break;
909 }
910
911 default:
912 unreachable("not reached");
913 }
914 return;
915 }
916
917 ir->accept(this);
918
919 resolve_ud_negate(&this->result);
920
921 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
922 inst->conditional_mod = BRW_CONDITIONAL_NZ;
923 }
924
925 /**
926 * Emit a gen6 IF statement with the comparison folded into the IF
927 * instruction.
928 */
929 void
930 vec4_visitor::emit_if_gen6(ir_if *ir)
931 {
932 ir_expression *expr = ir->condition->as_expression();
933
934 if (expr && expr->operation != ir_binop_ubo_load) {
935 src_reg op[3];
936 dst_reg temp;
937
938 assert(expr->get_num_operands() <= 3);
939 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
940 expr->operands[i]->accept(this);
941 op[i] = this->result;
942 }
943
944 switch (expr->operation) {
945 case ir_unop_logic_not:
946 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
947 return;
948
949 case ir_binop_logic_xor:
950 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
951 return;
952
953 case ir_binop_logic_or:
954 temp = dst_reg(this, glsl_type::bool_type);
955 emit(OR(temp, op[0], op[1]));
956 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
957 return;
958
959 case ir_binop_logic_and:
960 temp = dst_reg(this, glsl_type::bool_type);
961 emit(AND(temp, op[0], op[1]));
962 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
963 return;
964
965 case ir_unop_f2b:
966 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_i2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_binop_greater:
974 case ir_binop_gequal:
975 case ir_binop_less:
976 case ir_binop_lequal:
977 case ir_binop_equal:
978 case ir_binop_nequal:
979 emit(IF(op[0], op[1],
980 brw_conditional_for_comparison(expr->operation)));
981 return;
982
983 case ir_binop_all_equal:
984 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
985 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
986 return;
987
988 case ir_binop_any_nequal:
989 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
990 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
991 return;
992
993 case ir_unop_any:
994 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
995 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
996 return;
997
998 case ir_triop_csel: {
999 /* Expand the boolean condition into the flag register. */
1000 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1001 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1002
1003 /* Select which boolean to return. */
1004 dst_reg temp(this, expr->operands[1]->type);
1005 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1006 inst->predicate = BRW_PREDICATE_NORMAL;
1007
1008 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1009 return;
1010 }
1011
1012 default:
1013 unreachable("not reached");
1014 }
1015 return;
1016 }
1017
1018 ir->condition->accept(this);
1019
1020 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1021 }
1022
1023 void
1024 vec4_visitor::visit(ir_variable *ir)
1025 {
1026 dst_reg *reg = NULL;
1027
1028 if (variable_storage(ir))
1029 return;
1030
1031 switch (ir->data.mode) {
1032 case ir_var_shader_in:
1033 assert(ir->data.location != -1);
1034 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1035 break;
1036
1037 case ir_var_shader_out:
1038 assert(ir->data.location != -1);
1039 reg = new(mem_ctx) dst_reg(this, ir->type);
1040
1041 for (int i = 0; i < type_size(ir->type); i++) {
1042 output_reg[ir->data.location + i] = *reg;
1043 output_reg[ir->data.location + i].reg_offset = i;
1044 output_reg[ir->data.location + i].type =
1045 brw_type_for_base_type(ir->type->get_scalar_type());
1046 output_reg_annotation[ir->data.location + i] = ir->name;
1047 }
1048 break;
1049
1050 case ir_var_auto:
1051 case ir_var_temporary:
1052 reg = new(mem_ctx) dst_reg(this, ir->type);
1053 break;
1054
1055 case ir_var_uniform:
1056 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1057
1058 /* Thanks to the lower_ubo_reference pass, we will see only
1059 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1060 * variables, so no need for them to be in variable_ht.
1061 *
1062 * Some uniforms, such as samplers and atomic counters, have no actual
1063 * storage, so we should ignore them.
1064 */
1065 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1066 return;
1067
1068 /* Track how big the whole uniform variable is, in case we need to put a
1069 * copy of its data into pull constants for array access.
1070 */
1071 assert(this->uniforms < uniform_array_size);
1072 this->uniform_size[this->uniforms] = type_size(ir->type);
1073
1074 if (!strncmp(ir->name, "gl_", 3)) {
1075 setup_builtin_uniform_values(ir);
1076 } else {
1077 setup_uniform_values(ir);
1078 }
1079 break;
1080
1081 case ir_var_system_value:
1082 reg = make_reg_for_system_value(ir);
1083 break;
1084
1085 default:
1086 unreachable("not reached");
1087 }
1088
1089 reg->type = brw_type_for_base_type(ir->type);
1090 hash_table_insert(this->variable_ht, reg, ir);
1091 }
1092
1093 void
1094 vec4_visitor::visit(ir_loop *ir)
1095 {
1096 /* We don't want debugging output to print the whole body of the
1097 * loop as the annotation.
1098 */
1099 this->base_ir = NULL;
1100
1101 emit(BRW_OPCODE_DO);
1102
1103 visit_instructions(&ir->body_instructions);
1104
1105 emit(BRW_OPCODE_WHILE);
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_loop_jump *ir)
1110 {
1111 switch (ir->mode) {
1112 case ir_loop_jump::jump_break:
1113 emit(BRW_OPCODE_BREAK);
1114 break;
1115 case ir_loop_jump::jump_continue:
1116 emit(BRW_OPCODE_CONTINUE);
1117 break;
1118 }
1119 }
1120
1121
1122 void
1123 vec4_visitor::visit(ir_function_signature *)
1124 {
1125 unreachable("not reached");
1126 }
1127
1128 void
1129 vec4_visitor::visit(ir_function *ir)
1130 {
1131 /* Ignore function bodies other than main() -- we shouldn't see calls to
1132 * them since they should all be inlined.
1133 */
1134 if (strcmp(ir->name, "main") == 0) {
1135 const ir_function_signature *sig;
1136 exec_list empty;
1137
1138 sig = ir->matching_signature(NULL, &empty, false);
1139
1140 assert(sig);
1141
1142 visit_instructions(&sig->body);
1143 }
1144 }
1145
1146 bool
1147 vec4_visitor::try_emit_mad(ir_expression *ir)
1148 {
1149 /* 3-src instructions were introduced in gen6. */
1150 if (devinfo->gen < 6)
1151 return false;
1152
1153 /* MAD can only handle floating-point data. */
1154 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1155 return false;
1156
1157 ir_rvalue *nonmul;
1158 ir_expression *mul;
1159 bool mul_negate, mul_abs;
1160
1161 for (int i = 0; i < 2; i++) {
1162 mul_negate = false;
1163 mul_abs = false;
1164
1165 mul = ir->operands[i]->as_expression();
1166 nonmul = ir->operands[1 - i];
1167
1168 if (mul && mul->operation == ir_unop_abs) {
1169 mul = mul->operands[0]->as_expression();
1170 mul_abs = true;
1171 } else if (mul && mul->operation == ir_unop_neg) {
1172 mul = mul->operands[0]->as_expression();
1173 mul_negate = true;
1174 }
1175
1176 if (mul && mul->operation == ir_binop_mul)
1177 break;
1178 }
1179
1180 if (!mul || mul->operation != ir_binop_mul)
1181 return false;
1182
1183 nonmul->accept(this);
1184 src_reg src0 = fix_3src_operand(this->result);
1185
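/* Fold the negation into one multiply operand (-(x*y) == (-x)*y) and, for
 * an abs, distribute it across both operands (|x*y| == |x|*|y|), clearing
 * any negate since the abs makes its sign irrelevant.
 */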
1186 mul->operands[0]->accept(this);
1187 src_reg src1 = fix_3src_operand(this->result);
1188 src1.negate ^= mul_negate;
1189 src1.abs = mul_abs;
1190 if (mul_abs)
1191 src1.negate = false;
1192
1193 mul->operands[1]->accept(this);
1194 src_reg src2 = fix_3src_operand(this->result);
1195 src2.abs = mul_abs;
1196 if (mul_abs)
1197 src2.negate = false;
1198
1199 this->result = src_reg(this, ir->type);
1200 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1201
1202 return true;
1203 }
1204
1205 bool
1206 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1207 {
1208 /* This optimization relies on CMP setting the destination to 0 when
1209 * false. Early hardware only sets the least significant bit, and
1210 * leaves the other bits undefined. So we can't use it.
1211 */
1212 if (devinfo->gen < 6)
1213 return false;
1214
1215 ir_expression *const cmp = ir->operands[0]->as_expression();
1216
1217 if (cmp == NULL)
1218 return false;
1219
1220 switch (cmp->operation) {
1221 case ir_binop_less:
1222 case ir_binop_greater:
1223 case ir_binop_lequal:
1224 case ir_binop_gequal:
1225 case ir_binop_equal:
1226 case ir_binop_nequal:
1227 break;
1228
1229 default:
1230 return false;
1231 }
1232
1233 cmp->operands[0]->accept(this);
1234 const src_reg cmp_src0 = this->result;
1235
1236 cmp->operands[1]->accept(this);
1237 const src_reg cmp_src1 = this->result;
1238
1239 this->result = src_reg(this, ir->type);
1240
1241 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1242 brw_conditional_for_comparison(cmp->operation)));
1243
1244 /* If the comparison is false, this->result will just happen to be zero.
1245 */
1246 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1247 this->result, src_reg(1.0f));
1248 inst->predicate = BRW_PREDICATE_NORMAL;
1249 inst->predicate_inverse = true;
1250
1251 return true;
1252 }
1253
1254 void
1255 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1256 src_reg src0, src_reg src1)
1257 {
1258 vec4_instruction *inst;
1259
1260 if (devinfo->gen >= 6) {
1261 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1262 inst->conditional_mod = conditionalmod;
1263 } else {
1264 emit(CMP(dst, src0, src1, conditionalmod));
1265
1266 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1267 inst->predicate = BRW_PREDICATE_NORMAL;
1268 }
1269 }
1270
1271 void
1272 vec4_visitor::emit_lrp(const dst_reg &dst,
1273 const src_reg &x, const src_reg &y, const src_reg &a)
1274 {
1275 if (devinfo->gen >= 6) {
1276 /* Note that the instruction's argument order is reversed from GLSL
1277 * and the IR.
1278 */
1279 emit(LRP(dst,
1280 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1281 } else {
1282 /* Earlier generations don't support three source operations, so we
1283 * need to emit x*(1-a) + y*a.
1284 */
1285 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1286 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1287 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1288 y_times_a.writemask = dst.writemask;
1289 one_minus_a.writemask = dst.writemask;
1290 x_times_one_minus_a.writemask = dst.writemask;
1291
1292 emit(MUL(y_times_a, y, a));
1293 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1294 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1295 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1296 }
1297 }
1298
1299 /**
1300 * Emits the instructions needed to perform a pull constant load. before_block
1301 * and before_inst can be NULL, in which case the instruction will be appended
1302 * to the end of the instruction list.
1303 */
1304 void
1305 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1306 src_reg surf_index,
1307 src_reg offset_reg,
1308 bblock_t *before_block,
1309 vec4_instruction *before_inst)
1310 {
1311 assert((before_inst == NULL && before_block == NULL) ||
1312 (before_inst && before_block));
1313
1314 vec4_instruction *pull;
1315
1316 if (devinfo->gen >= 9) {
1317 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1318 src_reg header(this, glsl_type::uvec4_type, 2);
1319
1320 pull = new(mem_ctx)
1321 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1322 dst_reg(header));
1323
1324 if (before_inst)
1325 emit_before(before_block, before_inst, pull);
1326 else
1327 emit(pull);
1328
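/* The block offset goes in the second register of the two-register
 * header; write it to the .x channel there.
 */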
1329 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1330 offset_reg.type);
1331 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1332
1333 if (before_inst)
1334 emit_before(before_block, before_inst, pull);
1335 else
1336 emit(pull);
1337
1338 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1339 dst,
1340 surf_index,
1341 header);
1342 pull->mlen = 2;
1343 pull->header_present = true;
1344 } else if (devinfo->gen >= 7) {
1345 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1346
1347 grf_offset.type = offset_reg.type;
1348
1349 pull = MOV(grf_offset, offset_reg);
1350
1351 if (before_inst)
1352 emit_before(before_block, before_inst, pull);
1353 else
1354 emit(pull);
1355
1356 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1357 dst,
1358 surf_index,
1359 src_reg(grf_offset));
1360 pull->mlen = 1;
1361 } else {
1362 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1363 dst,
1364 surf_index,
1365 offset_reg);
1366 pull->base_mrf = 14;
1367 pull->mlen = 1;
1368 }
1369
1370 if (before_inst)
1371 emit_before(before_block, before_inst, pull);
1372 else
1373 emit(pull);
1374 }
1375
1376 void
1377 vec4_visitor::emit_uniformize(const dst_reg &dst, const src_reg &src)
1378 {
1379 const src_reg chan_index(this, glsl_type::uint_type);
1380
1381 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
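/* Find the index of an arbitrary live channel, then broadcast that
 * channel's value of src to every channel of dst, making the value
 * uniform across channels.
 */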
1382 ->force_writemask_all = true;
1383 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1384 ->force_writemask_all = true;
1385 }
1386
1387 void
1388 vec4_visitor::visit(ir_expression *ir)
1389 {
1390 unsigned int operand;
1391 src_reg op[ARRAY_SIZE(ir->operands)];
1392 vec4_instruction *inst;
1393
1394 if (ir->operation == ir_binop_add) {
1395 if (try_emit_mad(ir))
1396 return;
1397 }
1398
1399 if (ir->operation == ir_unop_b2f) {
1400 if (try_emit_b2f_of_compare(ir))
1401 return;
1402 }
1403
1404 /* Storage for our result. Ideally for an assignment we'd be using
1405 * the actual storage for the result here, instead.
1406 */
1407 dst_reg result_dst(this, ir->type);
1408 src_reg result_src(result_dst);
1409
1410 if (ir->operation == ir_triop_csel) {
1411 ir->operands[1]->accept(this);
1412 op[1] = this->result;
1413 ir->operands[2]->accept(this);
1414 op[2] = this->result;
1415
1416 enum brw_predicate predicate;
1417 emit_bool_to_cond_code(ir->operands[0], &predicate);
1418 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1419 inst->predicate = predicate;
1420 this->result = result_src;
1421 return;
1422 }
1423
1424 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1425 this->result.file = BAD_FILE;
1426 ir->operands[operand]->accept(this);
1427 if (this->result.file == BAD_FILE) {
1428 fprintf(stderr, "Failed to get tree for expression operand:\n");
1429 ir->operands[operand]->fprint(stderr);
1430 exit(1);
1431 }
1432 op[operand] = this->result;
1433
1434 /* Matrix expression operands should have been broken down to vector
1435 * operations already.
1436 */
1437 assert(!ir->operands[operand]->type->is_matrix());
1438 }
1439
1440 /* If nothing special happens, this is the result. */
1441 this->result = result_src;
1442
1443 switch (ir->operation) {
1444 case ir_unop_logic_not:
1445 emit(NOT(result_dst, op[0]));
1446 break;
1447 case ir_unop_neg:
1448 op[0].negate = !op[0].negate;
1449 emit(MOV(result_dst, op[0]));
1450 break;
1451 case ir_unop_abs:
1452 op[0].abs = true;
1453 op[0].negate = false;
1454 emit(MOV(result_dst, op[0]));
1455 break;
1456
1457 case ir_unop_sign:
1458 if (ir->type->is_float()) {
1459 /* AND(val, 0x80000000) gives the sign bit.
1460 *
1461 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1462 * zero.
1463 */
1464 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1465
1466 op[0].type = BRW_REGISTER_TYPE_UD;
1467 result_dst.type = BRW_REGISTER_TYPE_UD;
1468 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1469
1470 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1471 inst->predicate = BRW_PREDICATE_NORMAL;
1472
1473 this->result.type = BRW_REGISTER_TYPE_F;
1474 } else {
1475 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1476 * -> non-negative val generates 0x00000000.
1477 * Predicated OR sets 1 if val is positive.
1478 */
1479 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1480
1481 emit(ASR(result_dst, op[0], src_reg(31)));
1482
1483 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1484 inst->predicate = BRW_PREDICATE_NORMAL;
1485 }
1486 break;
1487
1488 case ir_unop_rcp:
1489 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1490 break;
1491
1492 case ir_unop_exp2:
1493 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1494 break;
1495 case ir_unop_log2:
1496 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1497 break;
1498 case ir_unop_exp:
1499 case ir_unop_log:
1500 unreachable("not reached: should be handled by ir_explog_to_explog2");
1501 case ir_unop_sin:
1502 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1503 break;
1504 case ir_unop_cos:
1505 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1506 break;
1507
1508 case ir_unop_dFdx:
1509 case ir_unop_dFdx_coarse:
1510 case ir_unop_dFdx_fine:
1511 case ir_unop_dFdy:
1512 case ir_unop_dFdy_coarse:
1513 case ir_unop_dFdy_fine:
1514 unreachable("derivatives not valid in vertex shader");
1515
1516 case ir_unop_bitfield_reverse:
1517 emit(BFREV(result_dst, op[0]));
1518 break;
1519 case ir_unop_bit_count:
1520 emit(CBIT(result_dst, op[0]));
1521 break;
1522 case ir_unop_find_msb: {
1523 src_reg temp = src_reg(this, glsl_type::uint_type);
1524
1525 inst = emit(FBH(dst_reg(temp), op[0]));
1526 inst->dst.writemask = WRITEMASK_XYZW;
1527
1528 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1529 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1530 * subtract the result from 31 to convert the MSB count into an LSB count.
1531 */
1532
1533 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1534 temp.swizzle = BRW_SWIZZLE_NOOP;
1535 emit(MOV(result_dst, temp));
1536
1537 src_reg src_tmp = src_reg(result_dst);
1538 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1539
1540 src_tmp.negate = true;
1541 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1542 inst->predicate = BRW_PREDICATE_NORMAL;
1543 break;
1544 }
1545 case ir_unop_find_lsb:
1546 emit(FBL(result_dst, op[0]));
1547 break;
1548 case ir_unop_saturate:
1549 inst = emit(MOV(result_dst, op[0]));
1550 inst->saturate = true;
1551 break;
1552
1553 case ir_unop_noise:
1554 unreachable("not reached: should be handled by lower_noise");
1555
1556 case ir_binop_add:
1557 emit(ADD(result_dst, op[0], op[1]));
1558 break;
1559 case ir_binop_sub:
1560 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1561
1562 case ir_binop_mul:
1563 if (devinfo->gen < 8 && ir->type->is_integer()) {
1564 /* For integer multiplication, the MUL uses the low 16 bits of one of
1565 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1566 * accumulates the contribution of the upper 16 bits of that
1567 * operand. If we can determine that one of the args is in the low
1568 * 16 bits, though, we can just emit a single MUL.
1569 */
1570 if (ir->operands[0]->is_uint16_constant()) {
1571 if (devinfo->gen < 7)
1572 emit(MUL(result_dst, op[0], op[1]));
1573 else
1574 emit(MUL(result_dst, op[1], op[0]));
1575 } else if (ir->operands[1]->is_uint16_constant()) {
1576 if (devinfo->gen < 7)
1577 emit(MUL(result_dst, op[1], op[0]));
1578 else
1579 emit(MUL(result_dst, op[0], op[1]));
1580 } else {
1581 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1582
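/* MUL writes a partial product to the accumulator, MACH folds in the
 * contribution of the operand's upper 16 bits, and the low 32 bits of
 * the full product are then read back out of the accumulator.
 */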
1583 emit(MUL(acc, op[0], op[1]));
1584 emit(MACH(dst_null_d(), op[0], op[1]));
1585 emit(MOV(result_dst, src_reg(acc)));
1586 }
1587 } else {
1588 emit(MUL(result_dst, op[0], op[1]));
1589 }
1590 break;
1591 case ir_binop_imul_high: {
1592 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1593
1594 emit(MUL(acc, op[0], op[1]));
1595 emit(MACH(result_dst, op[0], op[1]));
1596 break;
1597 }
1598 case ir_binop_div:
1599 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1600 assert(ir->type->is_integer());
1601 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1602 break;
1603 case ir_binop_carry: {
1604 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1605
1606 emit(ADDC(dst_null_ud(), op[0], op[1]));
1607 emit(MOV(result_dst, src_reg(acc)));
1608 break;
1609 }
1610 case ir_binop_borrow: {
1611 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1612
1613 emit(SUBB(dst_null_ud(), op[0], op[1]));
1614 emit(MOV(result_dst, src_reg(acc)));
1615 break;
1616 }
1617 case ir_binop_mod:
1618 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1619 assert(ir->type->is_integer());
1620 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1621 break;
1622
1623 case ir_binop_less:
1624 case ir_binop_greater:
1625 case ir_binop_lequal:
1626 case ir_binop_gequal:
1627 case ir_binop_equal:
1628 case ir_binop_nequal: {
1629 if (devinfo->gen <= 5) {
1630 resolve_bool_comparison(ir->operands[0], &op[0]);
1631 resolve_bool_comparison(ir->operands[1], &op[1]);
1632 }
1633 emit(CMP(result_dst, op[0], op[1],
1634 brw_conditional_for_comparison(ir->operation)));
1635 break;
1636 }
1637
1638 case ir_binop_all_equal:
1639 if (devinfo->gen <= 5) {
1640 resolve_bool_comparison(ir->operands[0], &op[0]);
1641 resolve_bool_comparison(ir->operands[1], &op[1]);
1642 }
1643
1644 /* "==" operator producing a scalar boolean. */
1645 if (ir->operands[0]->type->is_vector() ||
1646 ir->operands[1]->type->is_vector()) {
1647 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1648 emit(MOV(result_dst, src_reg(0)));
1649 inst = emit(MOV(result_dst, src_reg(~0)));
1650 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1651 } else {
1652 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1653 }
1654 break;
1655 case ir_binop_any_nequal:
1656 if (devinfo->gen <= 5) {
1657 resolve_bool_comparison(ir->operands[0], &op[0]);
1658 resolve_bool_comparison(ir->operands[1], &op[1]);
1659 }
1660
1661 /* "!=" operator producing a scalar boolean. */
1662 if (ir->operands[0]->type->is_vector() ||
1663 ir->operands[1]->type->is_vector()) {
1664 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1665
1666 emit(MOV(result_dst, src_reg(0)));
1667 inst = emit(MOV(result_dst, src_reg(~0)));
1668 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1669 } else {
1670 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1671 }
1672 break;
1673
1674 case ir_unop_any:
1675 if (devinfo->gen <= 5) {
1676 resolve_bool_comparison(ir->operands[0], &op[0]);
1677 }
1678 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1679 emit(MOV(result_dst, src_reg(0)));
1680
1681 inst = emit(MOV(result_dst, src_reg(~0)));
1682 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1683 break;
1684
1685 case ir_binop_logic_xor:
1686 emit(XOR(result_dst, op[0], op[1]));
1687 break;
1688
1689 case ir_binop_logic_or:
1690 emit(OR(result_dst, op[0], op[1]));
1691 break;
1692
1693 case ir_binop_logic_and:
1694 emit(AND(result_dst, op[0], op[1]));
1695 break;
1696
1697 case ir_binop_dot:
1698 assert(ir->operands[0]->type->is_vector());
1699 assert(ir->operands[0]->type == ir->operands[1]->type);
1700 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1701 break;
1702
1703 case ir_unop_sqrt:
1704 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1705 break;
1706 case ir_unop_rsq:
1707 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1708 break;
1709
1710 case ir_unop_bitcast_i2f:
1711 case ir_unop_bitcast_u2f:
1712 this->result = op[0];
1713 this->result.type = BRW_REGISTER_TYPE_F;
1714 break;
1715
1716 case ir_unop_bitcast_f2i:
1717 this->result = op[0];
1718 this->result.type = BRW_REGISTER_TYPE_D;
1719 break;
1720
1721 case ir_unop_bitcast_f2u:
1722 this->result = op[0];
1723 this->result.type = BRW_REGISTER_TYPE_UD;
1724 break;
1725
1726 case ir_unop_i2f:
1727 case ir_unop_i2u:
1728 case ir_unop_u2i:
1729 case ir_unop_u2f:
1730 case ir_unop_f2i:
1731 case ir_unop_f2u:
1732 emit(MOV(result_dst, op[0]));
1733 break;
1734 case ir_unop_b2i:
1735 emit(AND(result_dst, op[0], src_reg(1)));
1736 break;
1737 case ir_unop_b2f:
1738 if (devinfo->gen <= 5) {
1739 resolve_bool_comparison(ir->operands[0], &op[0]);
1740 }
1741 op[0].type = BRW_REGISTER_TYPE_D;
1742 result_dst.type = BRW_REGISTER_TYPE_D;
1743 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1744 result_dst.type = BRW_REGISTER_TYPE_F;
1745 break;
1746 case ir_unop_f2b:
1747 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1748 break;
1749 case ir_unop_i2b:
1750 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1751 break;
1752
1753 case ir_unop_trunc:
1754 emit(RNDZ(result_dst, op[0]));
1755 break;
1756 case ir_unop_ceil: {
1757 src_reg tmp = src_reg(this, ir->type);
1758 op[0].negate = !op[0].negate;
1759 emit(RNDD(dst_reg(tmp), op[0]));
1760 tmp.negate = true;
1761 emit(MOV(result_dst, tmp));
1762 }
1763 break;
1764 case ir_unop_floor:
1765 inst = emit(RNDD(result_dst, op[0]));
1766 break;
1767 case ir_unop_fract:
1768 inst = emit(FRC(result_dst, op[0]));
1769 break;
1770 case ir_unop_round_even:
1771 emit(RNDE(result_dst, op[0]));
1772 break;
1773
1774 case ir_binop_min:
1775 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1776 break;
1777 case ir_binop_max:
1778 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1779 break;
1780
1781 case ir_binop_pow:
1782 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1783 break;
1784
1785 case ir_unop_bit_not:
1786 inst = emit(NOT(result_dst, op[0]));
1787 break;
1788 case ir_binop_bit_and:
1789 inst = emit(AND(result_dst, op[0], op[1]));
1790 break;
1791 case ir_binop_bit_xor:
1792 inst = emit(XOR(result_dst, op[0], op[1]));
1793 break;
1794 case ir_binop_bit_or:
1795 inst = emit(OR(result_dst, op[0], op[1]));
1796 break;
1797
1798 case ir_binop_lshift:
1799 inst = emit(SHL(result_dst, op[0], op[1]));
1800 break;
1801
1802 case ir_binop_rshift:
1803 if (ir->type->base_type == GLSL_TYPE_INT)
1804 inst = emit(ASR(result_dst, op[0], op[1]));
1805 else
1806 inst = emit(SHR(result_dst, op[0], op[1]));
1807 break;
1808
1809 case ir_binop_bfm:
1810 emit(BFI1(result_dst, op[0], op[1]));
1811 break;
1812
1813 case ir_binop_ubo_load: {
1814 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1815 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1816 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1817 src_reg offset;
1818
1819 /* Now, load the vector from that offset. */
1820 assert(ir->type->is_vector() || ir->type->is_scalar());
1821
1822 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1823 packed_consts.type = result.type;
1824 src_reg surf_index;
1825
1826 if (const_uniform_block) {
1827 /* The block index is a constant, so just emit the binding table entry
1828 * as an immediate.
1829 */
1830 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1831 const_uniform_block->value.u[0]);
1832 } else {
1833 /* The block index is not a constant. Evaluate the index expression
1834 * per-channel and add the base UBO index; we have to select a value
1835 * from any live channel.
1836 */
1837 surf_index = src_reg(this, glsl_type::uint_type);
1838 emit(ADD(dst_reg(surf_index), op[0],
1839 src_reg(prog_data->base.binding_table.ubo_start)));
1840 emit_uniformize(dst_reg(surf_index), surf_index);
1841
1842 /* Assume this may touch any UBO. It would be nice to provide
1843 * a tighter bound, but the array information is already lowered away.
1844 */
1845 brw_mark_surface_used(&prog_data->base,
1846 prog_data->base.binding_table.ubo_start +
1847 shader_prog->NumUniformBlocks - 1);
1848 }
1849
1850 if (const_offset_ir) {
1851 if (devinfo->gen >= 8) {
1852 /* Store the offset in a GRF so we can send-from-GRF. */
1853 offset = src_reg(this, glsl_type::int_type);
1854 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1855 } else {
1856 /* Immediates are fine on older generations since they'll be moved
1857 * to a (potentially fake) MRF at the generator level.
1858 */
1859 offset = src_reg(const_offset / 16);
1860 }
1861 } else {
1862 offset = src_reg(this, glsl_type::uint_type);
1863 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1864 }
1865
1866 emit_pull_constant_load_reg(dst_reg(packed_consts),
1867 surf_index,
1868 offset,
1869 NULL, NULL /* before_block/inst */);
1870
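/* const_offset is in bytes; (const_offset % 16) / 4 is the dword index of
 * the value within the fetched 16-byte block, so adding it to every
 * component of the size-based swizzle below selects the right starting
 * channel.
 */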
1871 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1872 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1873 const_offset % 16 / 4,
1874 const_offset % 16 / 4,
1875 const_offset % 16 / 4);
1876
1877 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1878 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1879 emit(CMP(result_dst, packed_consts, src_reg(0u),
1880 BRW_CONDITIONAL_NZ));
1881 } else {
1882 emit(MOV(result_dst, packed_consts));
1883 }
1884 break;
1885 }
1886
1887 case ir_binop_vector_extract:
1888 unreachable("should have been lowered by vec_index_to_cond_assign");
1889
1890 case ir_triop_fma:
1891 op[0] = fix_3src_operand(op[0]);
1892 op[1] = fix_3src_operand(op[1]);
1893 op[2] = fix_3src_operand(op[2]);
1894 /* Note that the instruction's argument order is reversed from GLSL
1895 * and the IR.
1896 */
1897 emit(MAD(result_dst, op[2], op[1], op[0]));
1898 break;
1899
1900 case ir_triop_lrp:
1901 emit_lrp(result_dst, op[0], op[1], op[2]);
1902 break;
1903
1904 case ir_triop_csel:
1905 unreachable("already handled above");
1906 break;
1907
1908 case ir_triop_bfi:
1909 op[0] = fix_3src_operand(op[0]);
1910 op[1] = fix_3src_operand(op[1]);
1911 op[2] = fix_3src_operand(op[2]);
1912 emit(BFI2(result_dst, op[0], op[1], op[2]));
1913 break;
1914
1915 case ir_triop_bitfield_extract:
1916 op[0] = fix_3src_operand(op[0]);
1917 op[1] = fix_3src_operand(op[1]);
1918 op[2] = fix_3src_operand(op[2]);
1919 /* Note that the instruction's argument order is reversed from GLSL
1920 * and the IR.
1921 */
1922 emit(BFE(result_dst, op[2], op[1], op[0]));
1923 break;
1924
1925 case ir_triop_vector_insert:
1926 unreachable("should have been lowered by lower_vector_insert");
1927
1928 case ir_quadop_bitfield_insert:
1929 unreachable("not reached: should be handled by "
1930 "bitfield_insert_to_bfm_bfi\n");
1931
1932 case ir_quadop_vector:
1933 unreachable("not reached: should be handled by lower_quadop_vector");
1934
1935 case ir_unop_pack_half_2x16:
1936 emit_pack_half_2x16(result_dst, op[0]);
1937 break;
1938 case ir_unop_unpack_half_2x16:
1939 emit_unpack_half_2x16(result_dst, op[0]);
1940 break;
1941 case ir_unop_unpack_unorm_4x8:
1942 emit_unpack_unorm_4x8(result_dst, op[0]);
1943 break;
1944 case ir_unop_unpack_snorm_4x8:
1945 emit_unpack_snorm_4x8(result_dst, op[0]);
1946 break;
1947 case ir_unop_pack_unorm_4x8:
1948 emit_pack_unorm_4x8(result_dst, op[0]);
1949 break;
1950 case ir_unop_pack_snorm_4x8:
1951 emit_pack_snorm_4x8(result_dst, op[0]);
1952 break;
1953 case ir_unop_pack_snorm_2x16:
1954 case ir_unop_pack_unorm_2x16:
1955 case ir_unop_unpack_snorm_2x16:
1956 case ir_unop_unpack_unorm_2x16:
1957 unreachable("not reached: should be handled by lower_packing_builtins");
1958 case ir_unop_unpack_half_2x16_split_x:
1959 case ir_unop_unpack_half_2x16_split_y:
1960 case ir_binop_pack_half_2x16_split:
1961 case ir_unop_interpolate_at_centroid:
1962 case ir_binop_interpolate_at_sample:
1963 case ir_binop_interpolate_at_offset:
1964 unreachable("not reached: should not occur in vertex shader");
1965 case ir_binop_ldexp:
1966 unreachable("not reached: should be handled by ldexp_to_arith()");
1967 case ir_unop_d2f:
1968 case ir_unop_f2d:
1969 case ir_unop_d2i:
1970 case ir_unop_i2d:
1971 case ir_unop_d2u:
1972 case ir_unop_u2d:
1973 case ir_unop_d2b:
1974 case ir_unop_pack_double_2x32:
1975 case ir_unop_unpack_double_2x32:
1976 case ir_unop_frexp_sig:
1977 case ir_unop_frexp_exp:
1978 unreachable("fp64 todo");
1979 }
1980 }
1981
1982
1983 void
1984 vec4_visitor::visit(ir_swizzle *ir)
1985 {
1986 /* Note that this is only swizzles in expressions, not those on the left
1987 * hand side of an assignment, which do write masking. See ir_assignment
1988 * for that.
1989 */
1990 const unsigned swz = brw_compose_swizzle(
1991 brw_swizzle_for_size(ir->type->vector_elements),
1992 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1993
1994 ir->val->accept(this);
1995 this->result = swizzle(this->result, swz);
1996 }
1997
1998 void
1999 vec4_visitor::visit(ir_dereference_variable *ir)
2000 {
2001 const struct glsl_type *type = ir->type;
2002 dst_reg *reg = variable_storage(ir->var);
2003
2004 if (!reg) {
2005 fail("Failed to find variable storage for %s\n", ir->var->name);
2006 this->result = src_reg(brw_null_reg());
2007 return;
2008 }
2009
2010 this->result = src_reg(*reg);
2011
2012 /* System values get their swizzle from the dst_reg writemask */
2013 if (ir->var->data.mode == ir_var_system_value)
2014 return;
2015
2016 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2017 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2018 }
2019
2020
2021 int
2022 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2023 {
2024 /* Under normal circumstances array elements are stored consecutively, so
2025 * the stride is equal to the size of the array element.
2026 */
2027 return type_size(ir->type);
2028 }
2029
2030
2031 void
2032 vec4_visitor::visit(ir_dereference_array *ir)
2033 {
2034 ir_constant *constant_index;
2035 src_reg src;
2036 int array_stride = compute_array_stride(ir);
2037
2038 constant_index = ir->array_index->constant_expression_value();
2039
2040 ir->array->accept(this);
2041 src = this->result;
2042
2043 if (constant_index) {
2044 src.reg_offset += constant_index->value.i[0] * array_stride;
2045 } else {
2046 /* Variable index array dereference. It eats the "vec4" of the
2047 * base of the array and an index that offsets the Mesa register
2048 * index.
2049 */
2050 ir->array_index->accept(this);
2051
2052 src_reg index_reg;
2053
2054 if (array_stride == 1) {
2055 index_reg = this->result;
2056 } else {
2057 index_reg = src_reg(this, glsl_type::int_type);
2058
2059 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2060 }
2061
2062 if (src.reladdr) {
2063 src_reg temp = src_reg(this, glsl_type::int_type);
2064
2065 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2066
2067 index_reg = temp;
2068 }
2069
2070 src.reladdr = ralloc(mem_ctx, src_reg);
2071 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2072 }
2073
2074 /* If the type is smaller than a vec4, replicate the last channel out. */
2075 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2076 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2077 else
2078 src.swizzle = BRW_SWIZZLE_NOOP;
2079 src.type = brw_type_for_base_type(ir->type);
2080
2081 this->result = src;
2082 }
2083
2084 void
2085 vec4_visitor::visit(ir_dereference_record *ir)
2086 {
2087 unsigned int i;
2088 const glsl_type *struct_type = ir->record->type;
2089 int offset = 0;
2090
2091 ir->record->accept(this);
2092
2093 for (i = 0; i < struct_type->length; i++) {
2094 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2095 break;
2096 offset += type_size(struct_type->fields.structure[i].type);
2097 }
2098
2099 /* If the type is smaller than a vec4, replicate the last channel out. */
2100 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2101 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2102 else
2103 this->result.swizzle = BRW_SWIZZLE_NOOP;
2104 this->result.type = brw_type_for_base_type(ir->type);
2105
2106 this->result.reg_offset += offset;
2107 }
2108
2109 /**
2110 * We want to be careful in assignment setup to hit the actual storage
2111 * instead of potentially using a temporary like we might with the
2112 * ir_dereference handler.
2113 */
2114 static dst_reg
2115 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2116 {
2117 /* The LHS must be a dereference. If the LHS is a variable indexed array
2118    * access of a vector, it must be separated into a series of conditional moves
2119 * before reaching this point (see ir_vec_index_to_cond_assign).
2120 */
2121 assert(ir->as_dereference());
2122 ir_dereference_array *deref_array = ir->as_dereference_array();
2123 if (deref_array) {
2124 assert(!deref_array->array->type->is_vector());
2125 }
2126
2127 /* Use the rvalue deref handler for the most part. We'll ignore
2128 * swizzles in it and write swizzles using writemask, though.
2129 */
2130 ir->accept(v);
2131 return dst_reg(v->result);
2132 }
2133
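/**
 * Copy a whole aggregate (struct, array, matrix, vector or scalar) from
 * *src to *dst by recursing down to vec4-sized pieces and emitting one
 * (optionally predicated) MOV per piece, advancing reg_offset as it goes.
 */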
2134 void
2135 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2136 const struct glsl_type *type,
2137 enum brw_predicate predicate)
2138 {
2139 if (type->base_type == GLSL_TYPE_STRUCT) {
2140 for (unsigned int i = 0; i < type->length; i++) {
2141 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2142 }
2143 return;
2144 }
2145
2146 if (type->is_array()) {
2147 for (unsigned int i = 0; i < type->length; i++) {
2148 emit_block_move(dst, src, type->fields.array, predicate);
2149 }
2150 return;
2151 }
2152
2153 if (type->is_matrix()) {
2154 const struct glsl_type *vec_type;
2155
2156 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2157 type->vector_elements, 1);
2158
2159 for (int i = 0; i < type->matrix_columns; i++) {
2160 emit_block_move(dst, src, vec_type, predicate);
2161 }
2162 return;
2163 }
2164
2165 assert(type->is_scalar() || type->is_vector());
2166
2167 dst->type = brw_type_for_base_type(type);
2168 src->type = dst->type;
2169
2170 dst->writemask = (1 << type->vector_elements) - 1;
2171
2172 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2173
2174 vec4_instruction *inst = emit(MOV(*dst, *src));
2175 inst->predicate = predicate;
2176
2177 dst->reg_offset++;
2178 src->reg_offset++;
2179 }
2180
2181
2182 /* If the RHS processing resulted in an instruction generating a
2183 * temporary value, and it would be easy to rewrite the instruction to
2184 * generate its result right into the LHS instead, do so. This ends
2185 * up reliably removing instructions where it can be tricky to do so
2186 * later without real UD chain information.
2187 */
2188 bool
2189 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2190 dst_reg dst,
2191 src_reg src,
2192 vec4_instruction *pre_rhs_inst,
2193 vec4_instruction *last_rhs_inst)
2194 {
2195 /* This could be supported, but it would take more smarts. */
2196 if (ir->condition)
2197 return false;
2198
2199 if (pre_rhs_inst == last_rhs_inst)
2200 return false; /* No instructions generated to work with. */
2201
2202 /* Make sure the last instruction generated our source reg. */
2203 if (src.file != GRF ||
2204 src.file != last_rhs_inst->dst.file ||
2205 src.reg != last_rhs_inst->dst.reg ||
2206 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2207 src.reladdr ||
2208 src.abs ||
2209 src.negate ||
2210 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2211 return false;
2212
2213    /* Check that the last instruction fully initialized the channels
2214 * we want to use, in the order we want to use them. We could
2215 * potentially reswizzle the operands of many instructions so that
2216 * we could handle out of order channels, but don't yet.
2217 */
2218
2219 for (unsigned i = 0; i < 4; i++) {
2220 if (dst.writemask & (1 << i)) {
2221 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2222 return false;
2223
2224 if (BRW_GET_SWZ(src.swizzle, i) != i)
2225 return false;
2226 }
2227 }
2228
2229 /* Success! Rewrite the instruction. */
2230 last_rhs_inst->dst.file = dst.file;
2231 last_rhs_inst->dst.reg = dst.reg;
2232 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2233 last_rhs_inst->dst.reladdr = dst.reladdr;
2234 last_rhs_inst->dst.writemask &= dst.writemask;
2235
2236 return true;
2237 }
2238
2239 void
2240 vec4_visitor::visit(ir_assignment *ir)
2241 {
2242 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2243 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2244
2245 if (!ir->lhs->type->is_scalar() &&
2246 !ir->lhs->type->is_vector()) {
2247 ir->rhs->accept(this);
2248 src_reg src = this->result;
2249
2250 if (ir->condition) {
2251 emit_bool_to_cond_code(ir->condition, &predicate);
2252 }
2253
2254 /* emit_block_move doesn't account for swizzles in the source register.
2255 * This should be ok, since the source register is a structure or an
2256 * array, and those can't be swizzled. But double-check to be sure.
2257 */
2258 assert(src.swizzle ==
2259 (ir->rhs->type->is_matrix()
2260 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2261 : BRW_SWIZZLE_NOOP));
2262
2263 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2264 return;
2265 }
2266
2267 /* Now we're down to just a scalar/vector with writemasks. */
2268 int i;
2269
2270 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2271 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2272
2273 ir->rhs->accept(this);
2274
2275 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2276
2277 int swizzles[4];
2278 int src_chan = 0;
2279
2280 assert(ir->lhs->type->is_vector() ||
2281 ir->lhs->type->is_scalar());
2282 dst.writemask = ir->write_mask;
2283
2284 /* Swizzle a small RHS vector into the channels being written.
2285 *
2286     * GLSL IR treats write_mask as dictating how many channels are
2287     * present on the RHS, while in our instructions we need to make
2288 * those channels appear in the slots of the vec4 they're written to.
2289 */
2290 for (int i = 0; i < 4; i++)
2291 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2292
2293 src_reg src = swizzle(this->result,
2294 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2295 swizzles[2], swizzles[3]));
2296
2297 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2298 return;
2299 }
2300
2301 if (ir->condition) {
2302 emit_bool_to_cond_code(ir->condition, &predicate);
2303 }
2304
2305 for (i = 0; i < type_size(ir->lhs->type); i++) {
2306 vec4_instruction *inst = emit(MOV(dst, src));
2307 inst->predicate = predicate;
2308
2309 dst.reg_offset++;
2310 src.reg_offset++;
2311 }
2312 }
2313
2314 void
2315 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2316 {
2317 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2318 foreach_in_list(ir_constant, field_value, &ir->components) {
2319 emit_constant_values(dst, field_value);
2320 }
2321 return;
2322 }
2323
2324 if (ir->type->is_array()) {
2325 for (unsigned int i = 0; i < ir->type->length; i++) {
2326 emit_constant_values(dst, ir->array_elements[i]);
2327 }
2328 return;
2329 }
2330
2331 if (ir->type->is_matrix()) {
2332 for (int i = 0; i < ir->type->matrix_columns; i++) {
2333 float *vec = &ir->value.f[i * ir->type->vector_elements];
2334
2335 for (int j = 0; j < ir->type->vector_elements; j++) {
2336 dst->writemask = 1 << j;
2337 dst->type = BRW_REGISTER_TYPE_F;
2338
2339 emit(MOV(*dst, src_reg(vec[j])));
2340 }
2341 dst->reg_offset++;
2342 }
2343 return;
2344 }
2345
2346 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2347
2348 for (int i = 0; i < ir->type->vector_elements; i++) {
2349 if (!(remaining_writemask & (1 << i)))
2350 continue;
2351
2352 dst->writemask = 1 << i;
2353 dst->type = brw_type_for_base_type(ir->type);
2354
2355 /* Find other components that match the one we're about to
2356 * write. Emits fewer instructions for things like vec4(0.5,
2357 * 1.5, 1.5, 1.5).
2358 */
2359 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2360 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2361 if (ir->value.b[i] == ir->value.b[j])
2362 dst->writemask |= (1 << j);
2363 } else {
2364 /* u, i, and f storage all line up, so no need for a
2365 * switch case for comparing each type.
2366 */
2367 if (ir->value.u[i] == ir->value.u[j])
2368 dst->writemask |= (1 << j);
2369 }
2370 }
2371
2372 switch (ir->type->base_type) {
2373 case GLSL_TYPE_FLOAT:
2374 emit(MOV(*dst, src_reg(ir->value.f[i])));
2375 break;
2376 case GLSL_TYPE_INT:
2377 emit(MOV(*dst, src_reg(ir->value.i[i])));
2378 break;
2379 case GLSL_TYPE_UINT:
2380 emit(MOV(*dst, src_reg(ir->value.u[i])));
2381 break;
2382 case GLSL_TYPE_BOOL:
2383 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2384 break;
2385 default:
2386 unreachable("Non-float/uint/int/bool constant");
2387 }
2388
2389 remaining_writemask &= ~dst->writemask;
2390 }
2391 dst->reg_offset++;
2392 }
2393
2394 void
2395 vec4_visitor::visit(ir_constant *ir)
2396 {
2397 dst_reg dst = dst_reg(this, ir->type);
2398 this->result = src_reg(dst);
2399
2400 emit_constant_values(&dst, ir);
2401 }
2402
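/**
 * Translate the atomic counter intrinsics (__intrinsic_atomic_read,
 * _increment, _predecrement) into untyped surface read/atomic messages
 * against the atomic buffer object's binding table entry, computing the
 * surface offset from the counter's binding and any array index.
 */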
2403 void
2404 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2405 {
2406 ir_dereference *deref = static_cast<ir_dereference *>(
2407 ir->actual_parameters.get_head());
2408 ir_variable *location = deref->variable_referenced();
2409 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2410 location->data.binding);
2411
2412 /* Calculate the surface offset */
2413 src_reg offset(this, glsl_type::uint_type);
2414 ir_dereference_array *deref_array = deref->as_dereference_array();
2415 if (deref_array) {
2416 deref_array->array_index->accept(this);
2417
2418 src_reg tmp(this, glsl_type::uint_type);
2419 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2420 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2421 } else {
2422 offset = location->data.atomic.offset;
2423 }
2424
2425 /* Emit the appropriate machine instruction */
2426 const char *callee = ir->callee->function_name();
2427 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2428
2429 if (!strcmp("__intrinsic_atomic_read", callee)) {
2430 emit_untyped_surface_read(surf_index, dst, offset);
2431
2432 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2433 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2434 src_reg(), src_reg());
2435
2436 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2437 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2438 src_reg(), src_reg());
2439 }
2440 }
2441
2442 void
2443 vec4_visitor::visit(ir_call *ir)
2444 {
2445 const char *callee = ir->callee->function_name();
2446
2447 if (!strcmp("__intrinsic_atomic_read", callee) ||
2448 !strcmp("__intrinsic_atomic_increment", callee) ||
2449 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2450 visit_atomic_counter_intrinsic(ir);
2451 } else {
2452 unreachable("Unsupported intrinsic.");
2453 }
2454 }
2455
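/**
 * Emit a TXF_MCS message to fetch the MCS (multisample control surface)
 * data for the given coordinate, which the later txf_ms lookup needs when
 * the surface uses the compressed multisample layout.  The coordinate goes
 * in the first MRF with the unused channels zeroed; lod is implicitly zero.
 */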
2456 src_reg
2457 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2458 {
2459 vec4_instruction *inst =
2460 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2461 dst_reg(this, glsl_type::uvec4_type));
2462 inst->base_mrf = 2;
2463 inst->mlen = 1;
2464 inst->src[1] = sampler;
2465
2466 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2467 int param_base = inst->base_mrf;
2468 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2469 int zero_mask = 0xf & ~coord_mask;
2470
2471 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2472 coordinate));
2473
2474 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2475 src_reg(0)));
2476
2477 emit(inst);
2478 return src_reg(inst->dst);
2479 }
2480
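/**
 * Returns true if the sampler index might be >= 16 or isn't known at
 * compile time, in which case the message header is needed to address the
 * sampler.  Only Haswell and Gen8+ allow such indices at all.
 */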
2481 static bool
2482 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2483 {
2484 if (devinfo->gen < 8 && !devinfo->is_haswell)
2485 return false;
2486
2487 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2488 }
2489
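/**
 * Translate an ir_texture operation into the corresponding sampler
 * message: resolve the (possibly non-constant) sampler index, evaluate the
 * coordinate/LOD/gradient/offset operands, load them into MRFs in the
 * layout the message expects, and finally apply the result swizzle.
 */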
2490 void
2491 vec4_visitor::visit(ir_texture *ir)
2492 {
2493 uint32_t sampler =
2494 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2495
2496 ir_rvalue *nonconst_sampler_index =
2497 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2498
2499 /* Handle non-constant sampler array indexing */
2500 src_reg sampler_reg;
2501 if (nonconst_sampler_index) {
2502 /* The highest sampler which may be used by this operation is
2503 * the last element of the array. Mark it here, because the generator
2504 * doesn't have enough information to determine the bound.
2505 */
2506 uint32_t array_size = ir->sampler->as_dereference_array()
2507 ->array->type->array_size();
2508
2509 uint32_t max_used = sampler + array_size - 1;
2510 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2511 max_used += prog_data->base.binding_table.gather_texture_start;
2512 } else {
2513 max_used += prog_data->base.binding_table.texture_start;
2514 }
2515
2516 brw_mark_surface_used(&prog_data->base, max_used);
2517
2518 /* Emit code to evaluate the actual indexing expression */
2519 nonconst_sampler_index->accept(this);
2520 dst_reg temp(this, glsl_type::uint_type);
2521 emit(ADD(temp, this->result, src_reg(sampler)));
2522 emit_uniformize(temp, src_reg(temp));
2523
2524 sampler_reg = src_reg(temp);
2525 } else {
2526 /* Single sampler, or constant array index; the indexing expression
2527 * is just an immediate.
2528 */
2529 sampler_reg = src_reg(sampler);
2530 }
2531
2532 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2533 * emitting anything other than setting up the constant result.
2534 */
2535 if (ir->op == ir_tg4) {
2536 ir_constant *chan = ir->lod_info.component->as_constant();
2537 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2538 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2539 dst_reg result(this, ir->type);
2540 this->result = src_reg(result);
2541 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2542 return;
2543 }
2544 }
2545
2546 /* Should be lowered by do_lower_texture_projection */
2547 assert(!ir->projector);
2548
2549 /* Should be lowered */
2550 assert(!ir->offset || !ir->offset->type->is_array());
2551
2552 /* Generate code to compute all the subexpression trees. This has to be
2553 * done before loading any values into MRFs for the sampler message since
2554 * generating these values may involve SEND messages that need the MRFs.
2555 */
2556 src_reg coordinate;
2557 if (ir->coordinate) {
2558 ir->coordinate->accept(this);
2559 coordinate = this->result;
2560 }
2561
2562 src_reg shadow_comparitor;
2563 if (ir->shadow_comparitor) {
2564 ir->shadow_comparitor->accept(this);
2565 shadow_comparitor = this->result;
2566 }
2567
2568 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2569 src_reg offset_value;
2570 if (has_nonconstant_offset) {
2571 ir->offset->accept(this);
2572 offset_value = src_reg(this->result);
2573 }
2574
2575 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2576 src_reg lod, dPdx, dPdy, sample_index, mcs;
2577 switch (ir->op) {
2578 case ir_tex:
2579 lod = src_reg(0.0f);
2580 lod_type = glsl_type::float_type;
2581 break;
2582 case ir_txf:
2583 case ir_txl:
2584 case ir_txs:
2585 ir->lod_info.lod->accept(this);
2586 lod = this->result;
2587 lod_type = ir->lod_info.lod->type;
2588 break;
2589 case ir_query_levels:
2590 lod = src_reg(0);
2591 lod_type = glsl_type::int_type;
2592 break;
2593 case ir_txf_ms:
2594 ir->lod_info.sample_index->accept(this);
2595 sample_index = this->result;
2596 sample_index_type = ir->lod_info.sample_index->type;
2597
2598 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2599 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2600 else
2601 mcs = src_reg(0u);
2602 break;
2603 case ir_txd:
2604 ir->lod_info.grad.dPdx->accept(this);
2605 dPdx = this->result;
2606
2607 ir->lod_info.grad.dPdy->accept(this);
2608 dPdy = this->result;
2609
2610 lod_type = ir->lod_info.grad.dPdx->type;
2611 break;
2612 case ir_txb:
2613 case ir_lod:
2614 case ir_tg4:
2615 break;
2616 }
2617
2618 enum opcode opcode;
2619 switch (ir->op) {
2620 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2621 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2622 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2623 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2624 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2625 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2626 case ir_tg4: opcode = has_nonconstant_offset
2627 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2628 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2629 case ir_txb:
2630 unreachable("TXB is not valid for vertex shaders.");
2631 case ir_lod:
2632 unreachable("LOD is not valid for vertex shaders.");
2633 default:
2634 unreachable("Unrecognized tex op");
2635 }
2636
2637 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2638 opcode, dst_reg(this, ir->type));
2639
2640 if (ir->offset != NULL && !has_nonconstant_offset) {
2641 inst->offset =
2642 brw_texture_offset(ir->offset->as_constant()->value.i,
2643 ir->offset->type->vector_elements);
2644 }
2645
2646 /* Stuff the channel select bits in the top of the texture offset */
2647 if (ir->op == ir_tg4)
2648 inst->offset |= gather_channel(ir, sampler) << 16;
2649
2650 /* The message header is necessary for:
2651 * - Gen4 (always)
2652 * - Gen9+ for selecting SIMD4x2
2653 * - Texel offsets
2654 * - Gather channel selection
2655 * - Sampler indices too large to fit in a 4-bit value.
2656 */
2657 inst->header_present =
2658 devinfo->gen < 5 || devinfo->gen >= 9 ||
2659 inst->offset != 0 || ir->op == ir_tg4 ||
2660 is_high_sampler(devinfo, sampler_reg);
2661 inst->base_mrf = 2;
2662 inst->mlen = inst->header_present + 1; /* always at least one */
2663 inst->dst.writemask = WRITEMASK_XYZW;
2664 inst->shadow_compare = ir->shadow_comparitor != NULL;
2665
2666 inst->src[1] = sampler_reg;
2667
2668 /* MRF for the first parameter */
2669 int param_base = inst->base_mrf + inst->header_present;
2670
2671 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2672 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2673 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2674 } else {
2675 /* Load the coordinate */
2676 /* FINISHME: gl_clamp_mask and saturate */
2677 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2678 int zero_mask = 0xf & ~coord_mask;
2679
2680 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2681 coordinate));
2682
2683 if (zero_mask != 0) {
2684 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2685 src_reg(0)));
2686 }
2687 /* Load the shadow comparitor */
2688 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2689 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2690 WRITEMASK_X),
2691 shadow_comparitor));
2692 inst->mlen++;
2693 }
2694
2695 /* Load the LOD info */
2696 if (ir->op == ir_tex || ir->op == ir_txl) {
2697 int mrf, writemask;
2698 if (devinfo->gen >= 5) {
2699 mrf = param_base + 1;
2700 if (ir->shadow_comparitor) {
2701 writemask = WRITEMASK_Y;
2702 /* mlen already incremented */
2703 } else {
2704 writemask = WRITEMASK_X;
2705 inst->mlen++;
2706 }
2707 } else /* devinfo->gen == 4 */ {
2708 mrf = param_base;
2709 writemask = WRITEMASK_W;
2710 }
2711 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2712 } else if (ir->op == ir_txf) {
2713 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2714 } else if (ir->op == ir_txf_ms) {
2715 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2716 sample_index));
2717 if (devinfo->gen >= 7) {
2718 /* MCS data is in the first channel of `mcs`, but we need to get it into
2719 * the .y channel of the second vec4 of params, so replicate .x across
2720 * the whole vec4 and then mask off everything except .y
2721 */
2722 mcs.swizzle = BRW_SWIZZLE_XXXX;
2723 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2724 mcs));
2725 }
2726 inst->mlen++;
2727 } else if (ir->op == ir_txd) {
2728 const glsl_type *type = lod_type;
2729
2730 if (devinfo->gen >= 5) {
2731 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2732 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2733 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2734 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2735 inst->mlen++;
2736
2737 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2738 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2739 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2740 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2741 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2742 inst->mlen++;
2743
2744 if (ir->shadow_comparitor) {
2745 emit(MOV(dst_reg(MRF, param_base + 2,
2746 ir->shadow_comparitor->type, WRITEMASK_Z),
2747 shadow_comparitor));
2748 }
2749 }
2750 } else /* devinfo->gen == 4 */ {
2751 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2752 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2753 inst->mlen += 2;
2754 }
2755 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2756 if (ir->shadow_comparitor) {
2757 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2758 shadow_comparitor));
2759 }
2760
2761 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2762 offset_value));
2763 inst->mlen++;
2764 }
2765 }
2766
2767 emit(inst);
2768
2769 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2770 * spec requires layers.
2771 */
2772 if (ir->op == ir_txs) {
2773 glsl_type const *type = ir->sampler->type;
2774 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2775 type->sampler_array) {
2776 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2777 writemask(inst->dst, WRITEMASK_Z),
2778 src_reg(inst->dst), src_reg(6));
2779 }
2780 }
2781
2782 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2783 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2784 }
2785
2786 swizzle_result(ir, src_reg(inst->dst), sampler);
2787 }
2788
2789 /**
2790 * Apply workarounds for Gen6 gather with UINT/SINT
2791 */
2792 void
2793 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2794 {
2795 if (!wa)
2796 return;
2797
2798 int width = (wa & WA_8BIT) ? 8 : 16;
2799 dst_reg dst_f = dst;
2800 dst_f.type = BRW_REGISTER_TYPE_F;
2801
2802 /* Convert from UNORM to UINT */
2803 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2804 emit(MOV(dst, src_reg(dst_f)));
2805
2806 if (wa & WA_SIGN) {
2807 /* Reinterpret the UINT value as a signed INT value by
2808 * shifting the sign bit into place, then shifting back
2809 * preserving sign.
2810 */
2811 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2812 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2813 }
2814 }
2815
2816 /**
2817 * Set up the gather channel based on the swizzle, for gather4.
2818 */
2819 uint32_t
2820 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2821 {
2822 ir_constant *chan = ir->lod_info.component->as_constant();
2823 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2824 switch (swiz) {
2825 case SWIZZLE_X: return 0;
2826 case SWIZZLE_Y:
2827 /* gather4 sampler is broken for green channel on RG32F --
2828 * we must ask for blue instead.
2829 */
2830 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2831 return 2;
2832 return 1;
2833 case SWIZZLE_Z: return 2;
2834 case SWIZZLE_W: return 3;
2835 default:
2836 unreachable("Not reached"); /* zero, one swizzles handled already */
2837 }
2838 }
2839
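/**
 * Apply the texture swizzle from the sampler key to the raw sampler
 * result: copied channels become a single swizzled MOV, while
 * SWIZZLE_ZERO/SWIZZLE_ONE channels are written as immediate 0.0f/1.0f.
 */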
2840 void
2841 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2842 {
2843 int s = key->tex.swizzles[sampler];
2844
2845 this->result = src_reg(this, ir->type);
2846 dst_reg swizzled_result(this->result);
2847
2848 if (ir->op == ir_query_levels) {
2849 /* # levels is in .w */
2850 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2851 emit(MOV(swizzled_result, orig_val));
2852 return;
2853 }
2854
2855 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2856 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2857 emit(MOV(swizzled_result, orig_val));
2858 return;
2859 }
2860
2861
2862 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2863 int swizzle[4] = {0};
2864
2865 for (int i = 0; i < 4; i++) {
2866 switch (GET_SWZ(s, i)) {
2867 case SWIZZLE_ZERO:
2868 zero_mask |= (1 << i);
2869 break;
2870 case SWIZZLE_ONE:
2871 one_mask |= (1 << i);
2872 break;
2873 default:
2874 copy_mask |= (1 << i);
2875 swizzle[i] = GET_SWZ(s, i);
2876 break;
2877 }
2878 }
2879
2880 if (copy_mask) {
2881 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2882 swizzled_result.writemask = copy_mask;
2883 emit(MOV(swizzled_result, orig_val));
2884 }
2885
2886 if (zero_mask) {
2887 swizzled_result.writemask = zero_mask;
2888 emit(MOV(swizzled_result, src_reg(0.0f)));
2889 }
2890
2891 if (one_mask) {
2892 swizzled_result.writemask = one_mask;
2893 emit(MOV(swizzled_result, src_reg(1.0f)));
2894 }
2895 }
2896
2897 void
2898 vec4_visitor::visit(ir_return *)
2899 {
2900 unreachable("not reached");
2901 }
2902
2903 void
2904 vec4_visitor::visit(ir_discard *)
2905 {
2906 unreachable("not reached");
2907 }
2908
2909 void
2910 vec4_visitor::visit(ir_if *ir)
2911 {
2912 /* Don't point the annotation at the if statement, because then it plus
2913 * the then and else blocks get printed.
2914 */
2915 this->base_ir = ir->condition;
2916
2917 if (devinfo->gen == 6) {
2918 emit_if_gen6(ir);
2919 } else {
2920 enum brw_predicate predicate;
2921 emit_bool_to_cond_code(ir->condition, &predicate);
2922 emit(IF(predicate));
2923 }
2924
2925 visit_instructions(&ir->then_instructions);
2926
2927 if (!ir->else_instructions.is_empty()) {
2928 this->base_ir = ir->condition;
2929 emit(BRW_OPCODE_ELSE);
2930
2931 visit_instructions(&ir->else_instructions);
2932 }
2933
2934 this->base_ir = ir->condition;
2935 emit(BRW_OPCODE_ENDIF);
2936 }
2937
2938 void
2939 vec4_visitor::visit(ir_emit_vertex *)
2940 {
2941 unreachable("not reached");
2942 }
2943
2944 void
2945 vec4_visitor::visit(ir_end_primitive *)
2946 {
2947 unreachable("not reached");
2948 }
2949
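/**
 * Build the payload for an untyped atomic message: the surface offset in
 * the .x channel of the first message register, followed by up to two
 * operands, then emit SHADER_OPCODE_UNTYPED_ATOMIC with the resulting mlen.
 */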
2950 void
2951 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2952 dst_reg dst, src_reg offset,
2953 src_reg src0, src_reg src1)
2954 {
2955 unsigned mlen = 0;
2956
2957 /* Set the atomic operation offset. */
2958 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2959 mlen++;
2960
2961 /* Set the atomic operation arguments. */
2962 if (src0.file != BAD_FILE) {
2963 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2964 mlen++;
2965 }
2966
2967 if (src1.file != BAD_FILE) {
2968 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2969 mlen++;
2970 }
2971
2972 /* Emit the instruction. Note that this maps to the normal SIMD8
2973 * untyped atomic message on Ivy Bridge, but that's OK because
2974 * unused channels will be masked out.
2975 */
2976 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2977 brw_message_reg(0),
2978 src_reg(surf_index), src_reg(atomic_op));
2979 inst->mlen = mlen;
2980 }
2981
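/**
 * Emit an untyped surface read: the read offset goes in the .x channel of
 * the first message register, and the message is a single register long.
 */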
2982 void
2983 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2984 src_reg offset)
2985 {
2986 /* Set the surface read offset. */
2987 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2988
2989 /* Emit the instruction. Note that this maps to the normal SIMD8
2990 * untyped surface read message, but that's OK because unused
2991 * channels will be masked out.
2992 */
2993 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
2994 brw_message_reg(0),
2995 src_reg(surf_index), src_reg(1));
2996 inst->mlen = 1;
2997 }
2998
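/**
 * Compute the BRW_VARYING_SLOT_NDC output, (x/w, y/w, z/w, 1/w), from
 * gl_Position: an RCP produces 1/w in .w and a MUL scales xyz by it.
 * Only emitted pre-Gen6, where the fixed-function pipeline consumes it.
 */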
2999 void
3000 vec4_visitor::emit_ndc_computation()
3001 {
3002 /* Get the position */
3003 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3004
3005 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3006 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3007 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3008
3009 current_annotation = "NDC";
3010 dst_reg ndc_w = ndc;
3011 ndc_w.writemask = WRITEMASK_W;
3012 src_reg pos_w = pos;
3013 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3014 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3015
3016 dst_reg ndc_xyz = ndc;
3017 ndc_xyz.writemask = WRITEMASK_XYZ;
3018
3019 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3020 }
3021
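/**
 * Fill in the VUE slot that holds the point size and associated flags.
 * On Gen4-5 this packs point size, user clip flags and the negative-rhw
 * workaround bit into header1; on Gen6+ it zero-fills the slot and writes
 * point size, layer and viewport index into the appropriate channels.
 */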
3022 void
3023 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3024 {
3025 if (devinfo->gen < 6 &&
3026 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3027 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3028 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3029 dst_reg header1_w = header1;
3030 header1_w.writemask = WRITEMASK_W;
3031
3032 emit(MOV(header1, 0u));
3033
3034 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3035 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3036
3037 current_annotation = "Point size";
3038 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3039 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3040 }
3041
3042 if (key->userclip_active) {
3043 current_annotation = "Clipping flags";
3044 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3045 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3046
3047 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3048 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3049 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3050
3051 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3052 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3053 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3054 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3055 }
3056
3057 /* i965 clipping workaround:
3058 * 1) Test for -ve rhw
3059 * 2) If set,
3060 * set ndc = (0,0,0,0)
3061 * set ucp[6] = 1
3062 *
3063 * Later, clipping will detect ucp[6] and ensure the primitive is
3064 * clipped against all fixed planes.
3065 */
3066 if (devinfo->has_negative_rhw_bug) {
3067 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3068 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3069 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3070 vec4_instruction *inst;
3071 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3072 inst->predicate = BRW_PREDICATE_NORMAL;
3073 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3074 inst->predicate = BRW_PREDICATE_NORMAL;
3075 }
3076
3077 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3078 } else if (devinfo->gen < 6) {
3079 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3080 } else {
3081 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3082 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3083 dst_reg reg_w = reg;
3084 reg_w.writemask = WRITEMASK_W;
3085 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3086 }
3087 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3088 dst_reg reg_y = reg;
3089 reg_y.writemask = WRITEMASK_Y;
3090 reg_y.type = BRW_REGISTER_TYPE_D;
3091 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3092 }
3093 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3094 dst_reg reg_z = reg;
3095 reg_z.writemask = WRITEMASK_Z;
3096 reg_z.type = BRW_REGISTER_TYPE_D;
3097 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3098 }
3099 }
3100 }
3101
3102 void
3103 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3104 {
3105 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3106 *
3107 * "If a linked set of shaders forming the vertex stage contains no
3108 * static write to gl_ClipVertex or gl_ClipDistance, but the
3109 * application has requested clipping against user clip planes through
3110 * the API, then the coordinate written to gl_Position is used for
3111 * comparison against the user clip planes."
3112 *
3113 * This function is only called if the shader didn't write to
3114 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3115 * if the user wrote to it; otherwise we use gl_Position.
3116 */
3117 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3118 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3119 clip_vertex = VARYING_SLOT_POS;
3120 }
3121
3122 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3123 ++i) {
3124 reg.writemask = 1 << i;
3125 emit(DP4(reg,
3126 src_reg(output_reg[clip_vertex]),
3127 src_reg(this->userplane[i + offset])));
3128 }
3129 }
3130
3131 vec4_instruction *
3132 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3133 {
3134 assert (varying < VARYING_SLOT_MAX);
3135 reg.type = output_reg[varying].type;
3136 current_annotation = output_reg_annotation[varying];
3137 /* Copy the register, saturating if necessary */
3138 return emit(MOV(reg, src_reg(output_reg[varying])));
3139 }
3140
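/**
 * Emit the MOVs for one VUE slot into the given MRF, special-casing the
 * built-in slots (PSIZ plus flags, NDC, position, edge flag, padding) and
 * clamping the legacy color varyings when the key requests it.
 */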
3141 void
3142 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3143 {
3144 reg.type = BRW_REGISTER_TYPE_F;
3145
3146 switch (varying) {
3147 case VARYING_SLOT_PSIZ:
3148 {
3149 /* PSIZ is always in slot 0, and is coupled with other flags. */
3150 current_annotation = "indices, point width, clip flags";
3151 emit_psiz_and_flags(reg);
3152 break;
3153 }
3154 case BRW_VARYING_SLOT_NDC:
3155 current_annotation = "NDC";
3156 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3157 break;
3158 case VARYING_SLOT_POS:
3159 current_annotation = "gl_Position";
3160 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3161 break;
3162 case VARYING_SLOT_EDGE:
3163 /* This is present when doing unfilled polygons. We're supposed to copy
3164 * the edge flag from the user-provided vertex array
3165 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3166 * of that attribute (starts as 1.0f). This is then used in clipping to
3167 * determine which edges should be drawn as wireframe.
3168 */
3169 current_annotation = "edge flag";
3170 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3171 glsl_type::float_type, WRITEMASK_XYZW))));
3172 break;
3173 case BRW_VARYING_SLOT_PAD:
3174 /* No need to write to this slot */
3175 break;
3176 case VARYING_SLOT_COL0:
3177 case VARYING_SLOT_COL1:
3178 case VARYING_SLOT_BFC0:
3179 case VARYING_SLOT_BFC1: {
3180 /* These built-in varyings are only supported in compatibility mode,
3181 * and we only support GS in core profile. So, this must be a vertex
3182 * shader.
3183 */
3184 assert(stage == MESA_SHADER_VERTEX);
3185 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3186 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3187 inst->saturate = true;
3188 break;
3189 }
3190
3191 default:
3192 emit_generic_urb_slot(reg, varying);
3193 break;
3194 }
3195 }
3196
3197 static int
3198 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3199 {
3200 if (devinfo->gen >= 6) {
3201 /* URB data written (does not include the message header reg) must
3202 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3203 * section 5.4.3.2.2: URB_INTERLEAVED.
3204 *
3205 * URB entries are allocated on a multiple of 1024 bits, so an
3206 * extra 128 bits written here to make the end align to 256 is
3207 * no problem.
3208 */
3209 if ((mlen % 2) != 1)
3210 mlen++;
3211 }
3212
3213 return mlen;
3214 }
3215
3216
3217 /**
3218 * Generates the VUE payload plus the necessary URB write instructions to
3219 * output it.
3220 *
3221 * The VUE layout is documented in Volume 2a.
3222 */
3223 void
3224 vec4_visitor::emit_vertex()
3225 {
3226 /* MRF 0 is reserved for the debugger, so start with message header
3227 * in MRF 1.
3228 */
3229 int base_mrf = 1;
3230 int mrf = base_mrf;
3231 /* In the process of generating our URB write message contents, we
3232 * may need to unspill a register or load from an array. Those
3233 * reads would use MRFs 14-15.
3234 */
3235 int max_usable_mrf = 13;
3236
3237 /* The following assertion verifies that max_usable_mrf causes an
3238 * even-numbered amount of URB write data, which will meet gen6's
3239 * requirements for length alignment.
3240 */
3241 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3242
3243 /* First mrf is the g0-based message header containing URB handles and
3244 * such.
3245 */
3246 emit_urb_write_header(mrf++);
3247
3248 if (devinfo->gen < 6) {
3249 emit_ndc_computation();
3250 }
3251
3252 /* Lower legacy ff and ClipVertex clipping to clip distances */
3253 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3254 current_annotation = "user clip distances";
3255
3256 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3257 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3258
3259 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3260 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3261 }
3262
3263 /* We may need to split this up into several URB writes, so do them in a
3264 * loop.
3265 */
3266 int slot = 0;
3267 bool complete = false;
3268 do {
3269 /* URB offset is in URB row increments, and each of our MRFs is half of
3270 * one of those, since we're doing interleaved writes.
3271 */
3272 int offset = slot / 2;
3273
3274 mrf = base_mrf + 1;
3275 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3276 emit_urb_slot(dst_reg(MRF, mrf++),
3277 prog_data->vue_map.slot_to_varying[slot]);
3278
3279 /* If this was max_usable_mrf, we can't fit anything more into this
3280 * URB WRITE.
3281 */
3282 if (mrf > max_usable_mrf) {
3283 slot++;
3284 break;
3285 }
3286 }
3287
3288 complete = slot >= prog_data->vue_map.num_slots;
3289 current_annotation = "URB write";
3290 vec4_instruction *inst = emit_urb_write_opcode(complete);
3291 inst->base_mrf = base_mrf;
3292 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3293 inst->offset += offset;
3294 } while(!complete);
3295 }
3296
3297
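/**
 * Compute the offset operand for a scratch read/write of vec4 register
 * @reg_offset: scale by 2 for the interleaved layout, and by a further 16
 * pre-Gen6 where the header takes byte offsets.  A reladdr gets the same
 * scaling applied via ADD/MUL instructions emitted before @inst.
 */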
3298 src_reg
3299 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3300 src_reg *reladdr, int reg_offset)
3301 {
3302 /* Because we store the values to scratch interleaved like our
3303 * vertex data, we need to scale the vec4 index by 2.
3304 */
3305 int message_header_scale = 2;
3306
3307 /* Pre-gen6, the message header uses byte offsets instead of vec4
3308 * (16-byte) offset units.
3309 */
3310 if (devinfo->gen < 6)
3311 message_header_scale *= 16;
3312
3313 if (reladdr) {
3314 src_reg index = src_reg(this, glsl_type::int_type);
3315
3316 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3317 src_reg(reg_offset)));
3318 emit_before(block, inst, MUL(dst_reg(index), index,
3319 src_reg(message_header_scale)));
3320
3321 return index;
3322 } else {
3323 return src_reg(reg_offset * message_header_scale);
3324 }
3325 }
3326
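/**
 * Like get_scratch_offset(), but for pull constant loads: reladdr offsets
 * get the pre-Gen6 byte scaling, Gen8+ moves even constant offsets into a
 * GRF so we can send-from-GRF, and otherwise an immediate is returned.
 */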
3327 src_reg
3328 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3329 src_reg *reladdr, int reg_offset)
3330 {
3331 if (reladdr) {
3332 src_reg index = src_reg(this, glsl_type::int_type);
3333
3334 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3335 src_reg(reg_offset)));
3336
3337 /* Pre-gen6, the message header uses byte offsets instead of vec4
3338 * (16-byte) offset units.
3339 */
3340 if (devinfo->gen < 6) {
3341 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3342 }
3343
3344 return index;
3345 } else if (devinfo->gen >= 8) {
3346 /* Store the offset in a GRF so we can send-from-GRF. */
3347 src_reg offset = src_reg(this, glsl_type::int_type);
3348 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3349 return offset;
3350 } else {
3351 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3352 return src_reg(reg_offset * message_header_scale);
3353 }
3354 }
3355
3356 /**
3357 * Emits an instruction before @inst to load the value named by @orig_src
3358 * from scratch space at @base_offset to @temp.
3359 *
3360 * @base_offset is measured in 32-byte units (the size of a register).
3361 */
3362 void
3363 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3364 dst_reg temp, src_reg orig_src,
3365 int base_offset)
3366 {
3367 int reg_offset = base_offset + orig_src.reg_offset;
3368 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3369 reg_offset);
3370
3371 emit_before(block, inst, SCRATCH_READ(temp, index));
3372 }
3373
3374 /**
3375 * Emits an instruction after @inst to store the value to be written
3376 * to @orig_dst to scratch space at @base_offset, from @temp.
3377 *
3378 * @base_offset is measured in 32-byte units (the size of a register).
3379 */
3380 void
3381 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3382 int base_offset)
3383 {
3384 int reg_offset = base_offset + inst->dst.reg_offset;
3385 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3386 reg_offset);
3387
3388 /* Create a temporary register to store *inst's result in.
3389 *
3390 * We have to be careful in MOVing from our temporary result register in
3391 * the scratch write. If we swizzle from channels of the temporary that
3392 * weren't initialized, it will confuse live interval analysis, which will
3393 * make spilling fail to make progress.
3394 */
3395 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3396 inst->dst.type),
3397 brw_swizzle_for_mask(inst->dst.writemask));
3398 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3399 inst->dst.writemask));
3400 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3401 write->predicate = inst->predicate;
3402 write->ir = inst->ir;
3403 write->annotation = inst->annotation;
3404 inst->insert_after(block, write);
3405
3406 inst->dst.file = temp.file;
3407 inst->dst.reg = temp.reg;
3408 inst->dst.reg_offset = temp.reg_offset;
3409 inst->dst.reladdr = NULL;
3410 }
3411
3412 /**
3413 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3414 * adds the scratch read(s) before \p inst. The function also checks for
3415 * recursive reladdr scratch accesses, issuing the corresponding scratch
3416 * loads and rewriting reladdr references accordingly.
3417 *
3418 * \return \p src if it did not require a scratch load, otherwise, the
3419 * register holding the result of the scratch load that the caller should
3420 * use to rewrite src.
3421 */
3422 src_reg
3423 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3424 vec4_instruction *inst, src_reg src)
3425 {
3426 /* Resolve recursive reladdr scratch access by calling ourselves
3427 * with src.reladdr
3428 */
3429 if (src.reladdr)
3430 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3431 *src.reladdr);
3432
3433 /* Now handle scratch access on src */
3434 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3435 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3436 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3437 src.reg = temp.reg;
3438 src.reg_offset = temp.reg_offset;
3439 src.reladdr = NULL;
3440 }
3441
3442 return src;
3443 }
3444
3445 /**
3446 * We can't generally support array access in GRF space, because a
3447 * single instruction's destination can only span 2 contiguous
3448 * registers. So, we send all GRF arrays that get variable index
3449 * access to scratch space.
3450 */
3451 void
3452 vec4_visitor::move_grf_array_access_to_scratch()
3453 {
3454 int scratch_loc[this->alloc.count];
3455 memset(scratch_loc, -1, sizeof(scratch_loc));
3456
3457 /* First, calculate the set of virtual GRFs that need to be punted
3458 * to scratch due to having any array access on them, and where in
3459 * scratch.
3460 */
3461 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3462 if (inst->dst.file == GRF && inst->dst.reladdr) {
3463 if (scratch_loc[inst->dst.reg] == -1) {
3464 scratch_loc[inst->dst.reg] = c->last_scratch;
3465 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3466 }
3467
3468 for (src_reg *iter = inst->dst.reladdr;
3469 iter->reladdr;
3470 iter = iter->reladdr) {
3471 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3472 scratch_loc[iter->reg] = c->last_scratch;
3473 c->last_scratch += this->alloc.sizes[iter->reg];
3474 }
3475 }
3476 }
3477
3478 for (int i = 0 ; i < 3; i++) {
3479 for (src_reg *iter = &inst->src[i];
3480 iter->reladdr;
3481 iter = iter->reladdr) {
3482 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3483 scratch_loc[iter->reg] = c->last_scratch;
3484 c->last_scratch += this->alloc.sizes[iter->reg];
3485 }
3486 }
3487 }
3488 }
3489
3490 /* Now, for anything that will be accessed through scratch, rewrite
3491 * it to load/store. Note that this is a _safe list walk, because
3492 * we may generate a new scratch_write instruction after the one
3493 * we're processing.
3494 */
3495 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3496 /* Set up the annotation tracking for new generated instructions. */
3497 base_ir = inst->ir;
3498 current_annotation = inst->annotation;
3499
3500 /* First handle scratch access on the dst. Notice we have to handle
3501 * the case where the dst's reladdr also points to scratch space.
3502 */
3503 if (inst->dst.reladdr)
3504 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3505 *inst->dst.reladdr);
3506
3507 /* Now that we have handled any (possibly recursive) reladdr scratch
3508 * accesses for dst we can safely do the scratch write for dst itself
3509 */
3510 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3511 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3512
3513 /* Now handle scratch access on any src. In this case, since inst->src[i]
3514 * already is a src_reg, we can just call emit_resolve_reladdr with
3515 * inst->src[i] and it will take care of handling scratch loads for
3516 * both src and src.reladdr (recursively).
3517 */
3518 for (int i = 0 ; i < 3; i++) {
3519 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3520 inst->src[i]);
3521 }
3522 }
3523 }
3524
3525 /**
3526 * Emits an instruction before @inst to load the value named by @orig_src
3527 * from the pull constant buffer (surface) at @base_offset to @temp.
3528 */
3529 void
3530 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3531 dst_reg temp, src_reg orig_src,
3532 int base_offset)
3533 {
3534 int reg_offset = base_offset + orig_src.reg_offset;
3535 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3536 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3537 reg_offset);
3538
3539 emit_pull_constant_load_reg(temp,
3540 index,
3541 offset,
3542 block, inst);
3543 }
3544
3545 /**
3546 * Implements array access of uniforms by inserting a
3547 * PULL_CONSTANT_LOAD instruction.
3548 *
3549 * Unlike temporary GRF array access (where we don't support it due to
3550 * the difficulty of doing relative addressing on instruction
3551 * destinations), we could potentially do array access of uniforms
3552 * that were loaded in GRF space as push constants. In real-world
3553 * usage we've seen, though, the arrays being used are always larger
3554 * than we could load as push constants, so just always move all
3555 * uniform array access out to a pull constant buffer.
3556 */
3557 void
3558 vec4_visitor::move_uniform_array_access_to_pull_constants()
3559 {
3560 int pull_constant_loc[this->uniforms];
3561 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3562 bool nested_reladdr;
3563
3564 /* Walk through and find array access of uniforms. Put a copy of that
3565 * uniform in the pull constant buffer.
3566 *
3567 * Note that we don't move constant-indexed accesses to arrays. No
3568 * testing has been done of the performance impact of this choice.
3569 */
3570 do {
3571 nested_reladdr = false;
3572
3573 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3574 for (int i = 0 ; i < 3; i++) {
3575 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3576 continue;
3577
3578 int uniform = inst->src[i].reg;
3579
3580 if (inst->src[i].reladdr->reladdr)
3581 nested_reladdr = true; /* will need another pass */
3582
3583 /* If this array isn't already present in the pull constant buffer,
3584 * add it.
3585 */
3586 if (pull_constant_loc[uniform] == -1) {
3587 const gl_constant_value **values =
3588 &stage_prog_data->param[uniform * 4];
3589
3590 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3591
3592 assert(uniform < uniform_array_size);
3593 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3594 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3595 = values[j];
3596 }
3597 }
3598
3599 /* Set up the annotation tracking for new generated instructions. */
3600 base_ir = inst->ir;
3601 current_annotation = inst->annotation;
3602
3603 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3604
3605 emit_pull_constant_load(block, inst, temp, inst->src[i],
3606 pull_constant_loc[uniform]);
3607
3608 inst->src[i].file = temp.file;
3609 inst->src[i].reg = temp.reg;
3610 inst->src[i].reg_offset = temp.reg_offset;
3611 inst->src[i].reladdr = NULL;
3612 }
3613 }
3614 } while (nested_reladdr);
3615
3616 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3617 * no need to track them as larger-than-vec4 objects. This will be
3618 * relied on in cutting out unused uniform vectors from push
3619 * constants.
3620 */
3621 split_uniform_registers();
3622 }
3623
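/**
 * If @reg is UD-typed and has the negate flag set, resolve the negation up
 * front by MOVing it into a fresh uvec4 temporary and pointing @reg at
 * that, rather than relying on the negate source modifier downstream.
 */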
3624 void
3625 vec4_visitor::resolve_ud_negate(src_reg *reg)
3626 {
3627 if (reg->type != BRW_REGISTER_TYPE_UD ||
3628 !reg->negate)
3629 return;
3630
3631 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3632 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3633 *reg = temp;
3634 }
3635
3636 /**
3637 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3638 *
3639 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3640 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3641 */
3642 void
3643 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3644 {
3645 assert(devinfo->gen <= 5);
3646
3647 if (!rvalue->type->is_boolean())
3648 return;
3649
3650 src_reg and_result = src_reg(this, rvalue->type);
3651 src_reg neg_result = src_reg(this, rvalue->type);
3652 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3653 emit(MOV(dst_reg(neg_result), negate(and_result)));
3654 *reg = neg_result;
3655 }
3656
3657 vec4_visitor::vec4_visitor(struct brw_context *brw,
3658 struct brw_vec4_compile *c,
3659 struct gl_program *prog,
3660 const struct brw_vue_prog_key *key,
3661 struct brw_vue_prog_data *prog_data,
3662 struct gl_shader_program *shader_prog,
3663 gl_shader_stage stage,
3664 void *mem_ctx,
3665 bool no_spills,
3666 shader_time_shader_type st_base,
3667 shader_time_shader_type st_written,
3668 shader_time_shader_type st_reset)
3669 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3670 c(c),
3671 key(key),
3672 prog_data(prog_data),
3673 sanity_param_count(0),
3674 fail_msg(NULL),
3675 first_non_payload_grf(0),
3676 need_all_constants_in_pull_buffer(false),
3677 no_spills(no_spills),
3678 st_base(st_base),
3679 st_written(st_written),
3680 st_reset(st_reset)
3681 {
3682 this->mem_ctx = mem_ctx;
3683 this->failed = false;
3684
3685 this->base_ir = NULL;
3686 this->current_annotation = NULL;
3687 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3688
3689 this->variable_ht = hash_table_ctor(0,
3690 hash_table_pointer_hash,
3691 hash_table_pointer_compare);
3692
3693 this->virtual_grf_start = NULL;
3694 this->virtual_grf_end = NULL;
3695 this->live_intervals = NULL;
3696
3697 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3698
3699 this->uniforms = 0;
3700
3701 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3702 * at least one. See setup_uniforms() in brw_vec4.cpp.
3703 */
3704 this->uniform_array_size = 1;
3705 if (prog_data) {
3706 this->uniform_array_size =
3707 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3708 }
3709
3710 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3711 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3712 }
3713
3714 vec4_visitor::~vec4_visitor()
3715 {
3716 hash_table_dtor(this->variable_ht);
3717 }
3718
3719
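/**
 * Mark the compile as failed and record the failure message; only the
 * first failure is kept, and the message is printed to stderr when debug
 * output is enabled.
 */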
3720 void
3721 vec4_visitor::fail(const char *format, ...)
3722 {
3723 va_list va;
3724 char *msg;
3725
3726 if (failed)
3727 return;
3728
3729 failed = true;
3730
3731 va_start(va, format);
3732 msg = ralloc_vasprintf(mem_ctx, format, va);
3733 va_end(va);
3734 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3735
3736 this->fail_msg = msg;
3737
3738 if (debug_enabled) {
3739 fprintf(stderr, "%s", msg);
3740 }
3741 }
3742
3743 } /* namespace brw */