i965: Add a devinfo field to backend_visitor and use it for gen checks
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
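/* The ALUn() macros below generate the vec4_visitor::NOT(), ADD(), MAD(), ...
 * helpers: each one allocates a vec4_instruction with the matching BRW opcode
 * but does not add it to the instruction stream, so callers pass the result
 * to emit().  ALU2_ACC additionally marks the instruction as writing the
 * accumulator, and ALU3 asserts gen >= 6 because three-source instructions
 * only exist on Gen6 and later.
 */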
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
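/* Helpers for the Gen4-style scratch (spill space) access messages.  The
 * generator assembles the actual send starting at the fixed base MRF chosen
 * here; a read takes two message registers (header plus per-channel offsets)
 * and a write takes three (the extra register carries the data).
 */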
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
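   /* elements must be 2, 3 or 4, selecting DP2, DP3 or DP4 respectively. */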
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
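   /* Gen4/5 math operands are staged through MRFs by the generator and Gen8+
    * math no longer has the Gen6/7 operand restrictions, so only Gen6 and
    * Gen7 need the fixup below.
    */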
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
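      /* Gen4/5 math is a send to the shared math unit, so the generator needs
       * a base MRF and a message length: one register for a unary operation,
       * two when a second operand is present.
       */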
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
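   /* 0x00, 0x60, 0x70 and 0x78 are the 8-bit vector-float (VF) encodings of
    * 0.0, 8.0, 16.0 and 24.0; the type-converting MOV into the UD register
    * below turns them into the per-channel shift counts <0, 8, 16, 24>.
    */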
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
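   /* packUnorm4x8: clamp each component to [0, 1] with a saturating MOV,
    * scale by 255, round to even, convert to unsigned, and pack the four low
    * bytes into the destination dword.
    */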
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = brw_swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (strncmp(ir->name, storage->name, namelen) != 0 ||
690 (storage->name[namelen] != 0 &&
691 storage->name[namelen] != '.' &&
692 storage->name[namelen] != '[')) {
693 continue;
694 }
695
696 gl_constant_value *components = storage->storage;
697 unsigned vector_count = (MAX2(storage->array_elements, 1) *
698 storage->type->matrix_columns);
699
700 for (unsigned s = 0; s < vector_count; s++) {
701 assert(uniforms < uniform_array_size);
702 uniform_vector_size[uniforms] = storage->type->vector_elements;
703
704 int i;
705 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
706 stage_prog_data->param[uniforms * 4 + i] = components;
707 components++;
708 }
709 for (; i < 4; i++) {
710 static gl_constant_value zero = { 0.0 };
711 stage_prog_data->param[uniforms * 4 + i] = &zero;
712 }
713
714 uniforms++;
715 }
716 }
717 }
718
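/* Set up one vec4 uniform per active user clip plane and remember which
 * UNIFORM register each one landed in (this->userplane[i]) so the code that
 * later computes clip distances can read them.
 */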
719 void
720 vec4_visitor::setup_uniform_clipplane_values()
721 {
722 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
723
724 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 4;
727 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
728 this->userplane[i].type = BRW_REGISTER_TYPE_F;
729 for (int j = 0; j < 4; ++j) {
730 stage_prog_data->param[this->uniforms * 4 + j] =
731 (gl_constant_value *) &clip_planes[i][j];
732 }
733 ++this->uniforms;
734 }
735 }
736
737 /* Our support for builtin uniforms is even scarier than non-builtin.
738 * It sits on top of the PROG_STATE_VAR parameters that are
739 * automatically updated from GL context state.
740 */
741 void
742 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
743 {
744 const ir_state_slot *const slots = ir->get_state_slots();
745 assert(slots != NULL);
746
747 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
748 /* This state reference has already been setup by ir_to_mesa,
749 * but we'll get the same index back here. We can reference
750 * ParameterValues directly, since unlike brw_fs.cpp, we never
751 * add new state references during compile.
752 */
753 int index = _mesa_add_state_reference(this->prog->Parameters,
754 (gl_state_index *)slots[i].tokens);
755 gl_constant_value *values =
756 &this->prog->Parameters->ParameterValues[index][0];
757
758 assert(this->uniforms < uniform_array_size);
759
760 for (unsigned j = 0; j < 4; j++)
761 stage_prog_data->param[this->uniforms * 4 + j] =
762 &values[GET_SWZ(slots[i].swizzle, j)];
763
764 this->uniform_vector_size[this->uniforms] =
765 (ir->type->is_scalar() || ir->type->is_vector() ||
766 ir->type->is_matrix() ? ir->type->vector_elements : 4);
767
768 this->uniforms++;
769 }
770 }
771
772 dst_reg *
773 vec4_visitor::variable_storage(ir_variable *var)
774 {
775 return (dst_reg *)hash_table_find(this->variable_ht, var);
776 }
777
778 void
779 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
780 enum brw_predicate *predicate)
781 {
782 ir_expression *expr = ir->as_expression();
783
784 *predicate = BRW_PREDICATE_NORMAL;
785
786 if (expr && expr->operation != ir_binop_ubo_load) {
787 src_reg op[3];
788 vec4_instruction *inst;
789
790 assert(expr->get_num_operands() <= 3);
791 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
792 expr->operands[i]->accept(this);
793 op[i] = this->result;
794
795 resolve_ud_negate(&op[i]);
796 }
797
798 switch (expr->operation) {
799 case ir_unop_logic_not:
800 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
801 inst->conditional_mod = BRW_CONDITIONAL_Z;
802 break;
803
804 case ir_binop_logic_xor:
805 if (devinfo->gen <= 5) {
806 src_reg temp = src_reg(this, ir->type);
807 emit(XOR(dst_reg(temp), op[0], op[1]));
808 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
809 } else {
810 inst = emit(XOR(dst_null_d(), op[0], op[1]));
811 }
812 inst->conditional_mod = BRW_CONDITIONAL_NZ;
813 break;
814
815 case ir_binop_logic_or:
816 if (devinfo->gen <= 5) {
817 src_reg temp = src_reg(this, ir->type);
818 emit(OR(dst_reg(temp), op[0], op[1]));
819 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
820 } else {
821 inst = emit(OR(dst_null_d(), op[0], op[1]));
822 }
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 break;
825
826 case ir_binop_logic_and:
827 if (devinfo->gen <= 5) {
828 src_reg temp = src_reg(this, ir->type);
829 emit(AND(dst_reg(temp), op[0], op[1]));
830 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
831 } else {
832 inst = emit(AND(dst_null_d(), op[0], op[1]));
833 }
834 inst->conditional_mod = BRW_CONDITIONAL_NZ;
835 break;
836
837 case ir_unop_f2b:
838 if (devinfo->gen >= 6) {
839 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
840 } else {
841 inst = emit(MOV(dst_null_f(), op[0]));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 break;
845
846 case ir_unop_i2b:
847 if (devinfo->gen >= 6) {
848 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
849 } else {
850 inst = emit(MOV(dst_null_d(), op[0]));
851 inst->conditional_mod = BRW_CONDITIONAL_NZ;
852 }
853 break;
854
855 case ir_binop_all_equal:
856 if (devinfo->gen <= 5) {
857 resolve_bool_comparison(expr->operands[0], &op[0]);
858 resolve_bool_comparison(expr->operands[1], &op[1]);
859 }
860 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
861 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
862 break;
863
864 case ir_binop_any_nequal:
865 if (devinfo->gen <= 5) {
866 resolve_bool_comparison(expr->operands[0], &op[0]);
867 resolve_bool_comparison(expr->operands[1], &op[1]);
868 }
869 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
870 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
871 break;
872
873 case ir_unop_any:
874 if (devinfo->gen <= 5) {
875 resolve_bool_comparison(expr->operands[0], &op[0]);
876 }
877 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
878 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
879 break;
880
881 case ir_binop_greater:
882 case ir_binop_gequal:
883 case ir_binop_less:
884 case ir_binop_lequal:
885 case ir_binop_equal:
886 case ir_binop_nequal:
887 if (devinfo->gen <= 5) {
888 resolve_bool_comparison(expr->operands[0], &op[0]);
889 resolve_bool_comparison(expr->operands[1], &op[1]);
890 }
891 emit(CMP(dst_null_d(), op[0], op[1],
892 brw_conditional_for_comparison(expr->operation)));
893 break;
894
895 case ir_triop_csel: {
896 /* Expand the boolean condition into the flag register. */
897 inst = emit(MOV(dst_null_d(), op[0]));
898 inst->conditional_mod = BRW_CONDITIONAL_NZ;
899
900 /* Select which boolean to return. */
901 dst_reg temp(this, expr->operands[1]->type);
902 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
903 inst->predicate = BRW_PREDICATE_NORMAL;
904
905 /* Expand the result to a condition code. */
906 inst = emit(MOV(dst_null_d(), src_reg(temp)));
907 inst->conditional_mod = BRW_CONDITIONAL_NZ;
908 break;
909 }
910
911 default:
912 unreachable("not reached");
913 }
914 return;
915 }
916
917 ir->accept(this);
918
919 resolve_ud_negate(&this->result);
920
921 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
922 inst->conditional_mod = BRW_CONDITIONAL_NZ;
923 }
924
925 /**
926 * Emit a gen6 IF statement with the comparison folded into the IF
927 * instruction.
928 */
929 void
930 vec4_visitor::emit_if_gen6(ir_if *ir)
931 {
932 ir_expression *expr = ir->condition->as_expression();
933
934 if (expr && expr->operation != ir_binop_ubo_load) {
935 src_reg op[3];
936 dst_reg temp;
937
938 assert(expr->get_num_operands() <= 3);
939 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
940 expr->operands[i]->accept(this);
941 op[i] = this->result;
942 }
943
944 switch (expr->operation) {
945 case ir_unop_logic_not:
946 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
947 return;
948
949 case ir_binop_logic_xor:
950 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
951 return;
952
953 case ir_binop_logic_or:
954 temp = dst_reg(this, glsl_type::bool_type);
955 emit(OR(temp, op[0], op[1]));
956 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
957 return;
958
959 case ir_binop_logic_and:
960 temp = dst_reg(this, glsl_type::bool_type);
961 emit(AND(temp, op[0], op[1]));
962 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
963 return;
964
965 case ir_unop_f2b:
966 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_i2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_binop_greater:
974 case ir_binop_gequal:
975 case ir_binop_less:
976 case ir_binop_lequal:
977 case ir_binop_equal:
978 case ir_binop_nequal:
979 emit(IF(op[0], op[1],
980 brw_conditional_for_comparison(expr->operation)));
981 return;
982
983 case ir_binop_all_equal:
984 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
985 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
986 return;
987
988 case ir_binop_any_nequal:
989 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
990 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
991 return;
992
993 case ir_unop_any:
994 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
995 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
996 return;
997
998 case ir_triop_csel: {
999 /* Expand the boolean condition into the flag register. */
1000 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1001 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1002
1003 /* Select which boolean to return. */
1004 dst_reg temp(this, expr->operands[1]->type);
1005 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1006 inst->predicate = BRW_PREDICATE_NORMAL;
1007
1008 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1009 return;
1010 }
1011
1012 default:
1013 unreachable("not reached");
1014 }
1015 return;
1016 }
1017
1018 ir->condition->accept(this);
1019
1020 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1021 }
1022
1023 void
1024 vec4_visitor::visit(ir_variable *ir)
1025 {
1026 dst_reg *reg = NULL;
1027
1028 if (variable_storage(ir))
1029 return;
1030
1031 switch (ir->data.mode) {
1032 case ir_var_shader_in:
1033 assert(ir->data.location != -1);
1034 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1035 break;
1036
1037 case ir_var_shader_out:
1038 assert(ir->data.location != -1);
1039 reg = new(mem_ctx) dst_reg(this, ir->type);
1040
1041 for (int i = 0; i < type_size(ir->type); i++) {
1042 output_reg[ir->data.location + i] = *reg;
1043 output_reg[ir->data.location + i].reg_offset = i;
1044 output_reg[ir->data.location + i].type =
1045 brw_type_for_base_type(ir->type->get_scalar_type());
1046 output_reg_annotation[ir->data.location + i] = ir->name;
1047 }
1048 break;
1049
1050 case ir_var_auto:
1051 case ir_var_temporary:
1052 reg = new(mem_ctx) dst_reg(this, ir->type);
1053 break;
1054
1055 case ir_var_uniform:
1056 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1057
1058 /* Thanks to the lower_ubo_reference pass, we will see only
1059 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1060 * variables, so no need for them to be in variable_ht.
1061 *
1062 * Some uniforms, such as samplers and atomic counters, have no actual
1063 * storage, so we should ignore them.
1064 */
1065 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1066 return;
1067
1068 /* Track how big the whole uniform variable is, in case we need to put a
1069 * copy of its data into pull constants for array access.
1070 */
1071 assert(this->uniforms < uniform_array_size);
1072 this->uniform_size[this->uniforms] = type_size(ir->type);
1073
1074 if (!strncmp(ir->name, "gl_", 3)) {
1075 setup_builtin_uniform_values(ir);
1076 } else {
1077 setup_uniform_values(ir);
1078 }
1079 break;
1080
1081 case ir_var_system_value:
1082 reg = make_reg_for_system_value(ir);
1083 break;
1084
1085 default:
1086 unreachable("not reached");
1087 }
1088
1089 reg->type = brw_type_for_base_type(ir->type);
1090 hash_table_insert(this->variable_ht, reg, ir);
1091 }
1092
1093 void
1094 vec4_visitor::visit(ir_loop *ir)
1095 {
1096 /* We don't want debugging output to print the whole body of the
1097 * loop as the annotation.
1098 */
1099 this->base_ir = NULL;
1100
1101 emit(BRW_OPCODE_DO);
1102
1103 visit_instructions(&ir->body_instructions);
1104
1105 emit(BRW_OPCODE_WHILE);
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_loop_jump *ir)
1110 {
1111 switch (ir->mode) {
1112 case ir_loop_jump::jump_break:
1113 emit(BRW_OPCODE_BREAK);
1114 break;
1115 case ir_loop_jump::jump_continue:
1116 emit(BRW_OPCODE_CONTINUE);
1117 break;
1118 }
1119 }
1120
1121
1122 void
1123 vec4_visitor::visit(ir_function_signature *)
1124 {
1125 unreachable("not reached");
1126 }
1127
1128 void
1129 vec4_visitor::visit(ir_function *ir)
1130 {
1131 /* Ignore function bodies other than main() -- we shouldn't see calls to
1132 * them since they should all be inlined.
1133 */
1134 if (strcmp(ir->name, "main") == 0) {
1135 const ir_function_signature *sig;
1136 exec_list empty;
1137
1138 sig = ir->matching_signature(NULL, &empty, false);
1139
1140 assert(sig);
1141
1142 visit_instructions(&sig->body);
1143 }
1144 }
1145
1146 bool
1147 vec4_visitor::try_emit_mad(ir_expression *ir)
1148 {
1149 /* 3-src instructions were introduced in gen6. */
1150 if (devinfo->gen < 6)
1151 return false;
1152
1153 /* MAD can only handle floating-point data. */
1154 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1155 return false;
1156
1157 ir_rvalue *nonmul;
1158 ir_expression *mul;
1159 bool mul_negate, mul_abs;
1160
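   /* Look at both operands of the add for a multiply, peeling off an outer
    * negate or abs so it can be folded into the MAD sources below.
    */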
1161 for (int i = 0; i < 2; i++) {
1162 mul_negate = false;
1163 mul_abs = false;
1164
1165 mul = ir->operands[i]->as_expression();
1166 nonmul = ir->operands[1 - i];
1167
1168 if (mul && mul->operation == ir_unop_abs) {
1169 mul = mul->operands[0]->as_expression();
1170 mul_abs = true;
1171 } else if (mul && mul->operation == ir_unop_neg) {
1172 mul = mul->operands[0]->as_expression();
1173 mul_negate = true;
1174 }
1175
1176 if (mul && mul->operation == ir_binop_mul)
1177 break;
1178 }
1179
1180 if (!mul || mul->operation != ir_binop_mul)
1181 return false;
1182
1183 nonmul->accept(this);
1184 src_reg src0 = fix_3src_operand(this->result);
1185
1186 mul->operands[0]->accept(this);
1187 src_reg src1 = fix_3src_operand(this->result);
1188 src1.negate ^= mul_negate;
1189 src1.abs = mul_abs;
1190 if (mul_abs)
1191 src1.negate = false;
1192
1193 mul->operands[1]->accept(this);
1194 src_reg src2 = fix_3src_operand(this->result);
1195 src2.abs = mul_abs;
1196 if (mul_abs)
1197 src2.negate = false;
1198
1199 this->result = src_reg(this, ir->type);
1200 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1201
1202 return true;
1203 }
1204
1205 bool
1206 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1207 {
1208 /* This optimization relies on CMP setting the destination to 0 when
1209 * false. Early hardware only sets the least significant bit, and
1210 * leaves the other bits undefined. So we can't use it.
1211 */
1212 if (devinfo->gen < 6)
1213 return false;
1214
1215 ir_expression *const cmp = ir->operands[0]->as_expression();
1216
1217 if (cmp == NULL)
1218 return false;
1219
1220 switch (cmp->operation) {
1221 case ir_binop_less:
1222 case ir_binop_greater:
1223 case ir_binop_lequal:
1224 case ir_binop_gequal:
1225 case ir_binop_equal:
1226 case ir_binop_nequal:
1227 break;
1228
1229 default:
1230 return false;
1231 }
1232
1233 cmp->operands[0]->accept(this);
1234 const src_reg cmp_src0 = this->result;
1235
1236 cmp->operands[1]->accept(this);
1237 const src_reg cmp_src1 = this->result;
1238
1239 this->result = src_reg(this, ir->type);
1240
1241 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1242 brw_conditional_for_comparison(cmp->operation)));
1243
1244 /* If the comparison is false, this->result will just happen to be zero.
1245 */
1246 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1247 this->result, src_reg(1.0f));
1248 inst->predicate = BRW_PREDICATE_NORMAL;
1249 inst->predicate_inverse = true;
1250
1251 return true;
1252 }
1253
1254 void
1255 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1256 src_reg src0, src_reg src1)
1257 {
1258 vec4_instruction *inst;
1259
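   /* Gen6+ can compute min/max with a single SEL carrying a conditional
    * modifier; older hardware sets the flag with a CMP first and then uses a
    * predicated SEL.
    */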
1260 if (devinfo->gen >= 6) {
1261 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1262 inst->conditional_mod = conditionalmod;
1263 } else {
1264 emit(CMP(dst, src0, src1, conditionalmod));
1265
1266 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1267 inst->predicate = BRW_PREDICATE_NORMAL;
1268 }
1269 }
1270
1271 void
1272 vec4_visitor::emit_lrp(const dst_reg &dst,
1273 const src_reg &x, const src_reg &y, const src_reg &a)
1274 {
1275 if (devinfo->gen >= 6) {
1276 /* Note that the instruction's argument order is reversed from GLSL
1277 * and the IR.
1278 */
1279 emit(LRP(dst,
1280 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1281 } else {
1282 /* Earlier generations don't support three source operations, so we
1283 * need to emit x*(1-a) + y*a.
1284 */
1285 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1286 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1287 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1288 y_times_a.writemask = dst.writemask;
1289 one_minus_a.writemask = dst.writemask;
1290 x_times_one_minus_a.writemask = dst.writemask;
1291
1292 emit(MUL(y_times_a, y, a));
1293 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1294 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1295 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1296 }
1297 }
1298
1299 /**
1300 * Emits the instructions needed to perform a pull constant load. before_block
1301  * and before_inst can be NULL, in which case the instructions will be appended
1302  * to the end of the instruction list.
1303 */
1304 void
1305 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1306 src_reg surf_index,
1307 src_reg offset_reg,
1308 bblock_t *before_block,
1309 vec4_instruction *before_inst)
1310 {
1311 assert((before_inst == NULL && before_block == NULL) ||
1312 (before_inst && before_block));
1313
1314 vec4_instruction *pull;
1315
1316 if (devinfo->gen >= 9) {
1317 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1318 src_reg header(this, glsl_type::uvec4_type, 2);
1319
1320 pull = new(mem_ctx)
1321 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1322 dst_reg(header));
1323
1324 if (before_inst)
1325 emit_before(before_block, before_inst, pull);
1326 else
1327 emit(pull);
1328
1329 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1330 offset_reg.type);
1331 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1332
1333 if (before_inst)
1334 emit_before(before_block, before_inst, pull);
1335 else
1336 emit(pull);
1337
1338 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1339 dst,
1340 surf_index,
1341 header);
1342 pull->mlen = 2;
1343 pull->header_present = true;
1344 } else if (devinfo->gen >= 7) {
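      /* Gen7/8 use a headerless send-from-GRF message: just copy the offset
       * into a GRF and issue the load.
       */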
1345 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1346
1347 grf_offset.type = offset_reg.type;
1348
1349 pull = MOV(grf_offset, offset_reg);
1350
1351 if (before_inst)
1352 emit_before(before_block, before_inst, pull);
1353 else
1354 emit(pull);
1355
1356 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1357 dst,
1358 surf_index,
1359 src_reg(grf_offset));
1360 pull->mlen = 1;
1361 } else {
1362 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1363 dst,
1364 surf_index,
1365 offset_reg);
1366 pull->base_mrf = 14;
1367 pull->mlen = 1;
1368 }
1369
1370 if (before_inst)
1371 emit_before(before_block, before_inst, pull);
1372 else
1373 emit(pull);
1374 }
1375
1376 void
1377 vec4_visitor::visit(ir_expression *ir)
1378 {
1379 unsigned int operand;
1380 src_reg op[ARRAY_SIZE(ir->operands)];
1381 vec4_instruction *inst;
1382
1383 if (ir->operation == ir_binop_add) {
1384 if (try_emit_mad(ir))
1385 return;
1386 }
1387
1388 if (ir->operation == ir_unop_b2f) {
1389 if (try_emit_b2f_of_compare(ir))
1390 return;
1391 }
1392
1393 /* Storage for our result. Ideally for an assignment we'd be using
1394 * the actual storage for the result here, instead.
1395 */
1396 dst_reg result_dst(this, ir->type);
1397 src_reg result_src(result_dst);
1398
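   /* ir_triop_csel is handled before the generic operand walk because its
    * first operand is turned into a predicate (condition code) rather than
    * being evaluated as an ordinary source value.
    */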
1399 if (ir->operation == ir_triop_csel) {
1400 ir->operands[1]->accept(this);
1401 op[1] = this->result;
1402 ir->operands[2]->accept(this);
1403 op[2] = this->result;
1404
1405 enum brw_predicate predicate;
1406 emit_bool_to_cond_code(ir->operands[0], &predicate);
1407 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1408 inst->predicate = predicate;
1409 this->result = result_src;
1410 return;
1411 }
1412
1413 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1414 this->result.file = BAD_FILE;
1415 ir->operands[operand]->accept(this);
1416 if (this->result.file == BAD_FILE) {
1417 fprintf(stderr, "Failed to get tree for expression operand:\n");
1418 ir->operands[operand]->fprint(stderr);
1419 exit(1);
1420 }
1421 op[operand] = this->result;
1422
1423 /* Matrix expression operands should have been broken down to vector
1424 * operations already.
1425 */
1426 assert(!ir->operands[operand]->type->is_matrix());
1427 }
1428
1429 /* If nothing special happens, this is the result. */
1430 this->result = result_src;
1431
1432 switch (ir->operation) {
1433 case ir_unop_logic_not:
1434 emit(NOT(result_dst, op[0]));
1435 break;
1436 case ir_unop_neg:
1437 op[0].negate = !op[0].negate;
1438 emit(MOV(result_dst, op[0]));
1439 break;
1440 case ir_unop_abs:
1441 op[0].abs = true;
1442 op[0].negate = false;
1443 emit(MOV(result_dst, op[0]));
1444 break;
1445
1446 case ir_unop_sign:
1447 if (ir->type->is_float()) {
1448 /* AND(val, 0x80000000) gives the sign bit.
1449 *
1450 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1451 * zero.
1452 */
1453 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1454
1455 op[0].type = BRW_REGISTER_TYPE_UD;
1456 result_dst.type = BRW_REGISTER_TYPE_UD;
1457 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1458
1459 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1460 inst->predicate = BRW_PREDICATE_NORMAL;
1461
1462 this->result.type = BRW_REGISTER_TYPE_F;
1463 } else {
1464 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1465 * -> non-negative val generates 0x00000000.
1466 * Predicated OR sets 1 if val is positive.
1467 */
1468 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1469
1470 emit(ASR(result_dst, op[0], src_reg(31)));
1471
1472 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1473 inst->predicate = BRW_PREDICATE_NORMAL;
1474 }
1475 break;
1476
1477 case ir_unop_rcp:
1478 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1479 break;
1480
1481 case ir_unop_exp2:
1482 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1483 break;
1484 case ir_unop_log2:
1485 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1486 break;
1487 case ir_unop_exp:
1488 case ir_unop_log:
1489 unreachable("not reached: should be handled by ir_explog_to_explog2");
1490 case ir_unop_sin:
1491 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1492 break;
1493 case ir_unop_cos:
1494 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1495 break;
1496
1497 case ir_unop_dFdx:
1498 case ir_unop_dFdx_coarse:
1499 case ir_unop_dFdx_fine:
1500 case ir_unop_dFdy:
1501 case ir_unop_dFdy_coarse:
1502 case ir_unop_dFdy_fine:
1503 unreachable("derivatives not valid in vertex shader");
1504
1505 case ir_unop_bitfield_reverse:
1506 emit(BFREV(result_dst, op[0]));
1507 break;
1508 case ir_unop_bit_count:
1509 emit(CBIT(result_dst, op[0]));
1510 break;
1511 case ir_unop_find_msb: {
1512 src_reg temp = src_reg(this, glsl_type::uint_type);
1513
1514 inst = emit(FBH(dst_reg(temp), op[0]));
1515 inst->dst.writemask = WRITEMASK_XYZW;
1516
1517 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1518 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1519 * subtract the result from 31 to convert the MSB count into an LSB count.
1520 */
1521
1522 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1523 temp.swizzle = BRW_SWIZZLE_NOOP;
1524 emit(MOV(result_dst, temp));
1525
1526 src_reg src_tmp = src_reg(result_dst);
1527 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1528
1529 src_tmp.negate = true;
1530 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1531 inst->predicate = BRW_PREDICATE_NORMAL;
1532 break;
1533 }
1534 case ir_unop_find_lsb:
1535 emit(FBL(result_dst, op[0]));
1536 break;
1537 case ir_unop_saturate:
1538 inst = emit(MOV(result_dst, op[0]));
1539 inst->saturate = true;
1540 break;
1541
1542 case ir_unop_noise:
1543 unreachable("not reached: should be handled by lower_noise");
1544
1545 case ir_binop_add:
1546 emit(ADD(result_dst, op[0], op[1]));
1547 break;
1548 case ir_binop_sub:
1549 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1550
1551 case ir_binop_mul:
1552 if (devinfo->gen < 8 && ir->type->is_integer()) {
1553 /* For integer multiplication, the MUL uses the low 16 bits of one of
1554 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1555 * accumulates in the contribution of the upper 16 bits of that
1556 * operand. If we can determine that one of the args is in the low
1557 * 16 bits, though, we can just emit a single MUL.
1558 */
1559 if (ir->operands[0]->is_uint16_constant()) {
1560 if (devinfo->gen < 7)
1561 emit(MUL(result_dst, op[0], op[1]));
1562 else
1563 emit(MUL(result_dst, op[1], op[0]));
1564 } else if (ir->operands[1]->is_uint16_constant()) {
1565 if (devinfo->gen < 7)
1566 emit(MUL(result_dst, op[1], op[0]));
1567 else
1568 emit(MUL(result_dst, op[0], op[1]));
1569 } else {
1570 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1571
1572 emit(MUL(acc, op[0], op[1]));
1573 emit(MACH(dst_null_d(), op[0], op[1]));
1574 emit(MOV(result_dst, src_reg(acc)));
1575 }
1576 } else {
1577 emit(MUL(result_dst, op[0], op[1]));
1578 }
1579 break;
1580 case ir_binop_imul_high: {
1581 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1582
1583 emit(MUL(acc, op[0], op[1]));
1584 emit(MACH(result_dst, op[0], op[1]));
1585 break;
1586 }
1587 case ir_binop_div:
1588 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1589 assert(ir->type->is_integer());
1590 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1591 break;
1592 case ir_binop_carry: {
1593 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1594
1595 emit(ADDC(dst_null_ud(), op[0], op[1]));
1596 emit(MOV(result_dst, src_reg(acc)));
1597 break;
1598 }
1599 case ir_binop_borrow: {
1600 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1601
1602 emit(SUBB(dst_null_ud(), op[0], op[1]));
1603 emit(MOV(result_dst, src_reg(acc)));
1604 break;
1605 }
1606 case ir_binop_mod:
1607 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1608 assert(ir->type->is_integer());
1609 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1610 break;
1611
1612 case ir_binop_less:
1613 case ir_binop_greater:
1614 case ir_binop_lequal:
1615 case ir_binop_gequal:
1616 case ir_binop_equal:
1617 case ir_binop_nequal: {
1618 if (devinfo->gen <= 5) {
1619 resolve_bool_comparison(ir->operands[0], &op[0]);
1620 resolve_bool_comparison(ir->operands[1], &op[1]);
1621 }
1622 emit(CMP(result_dst, op[0], op[1],
1623 brw_conditional_for_comparison(ir->operation)));
1624 break;
1625 }
1626
1627 case ir_binop_all_equal:
1628 if (devinfo->gen <= 5) {
1629 resolve_bool_comparison(ir->operands[0], &op[0]);
1630 resolve_bool_comparison(ir->operands[1], &op[1]);
1631 }
1632
1633 /* "==" operator producing a scalar boolean. */
1634 if (ir->operands[0]->type->is_vector() ||
1635 ir->operands[1]->type->is_vector()) {
1636 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1637 emit(MOV(result_dst, src_reg(0)));
1638 inst = emit(MOV(result_dst, src_reg(~0)));
1639 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1640 } else {
1641 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1642 }
1643 break;
1644 case ir_binop_any_nequal:
1645 if (devinfo->gen <= 5) {
1646 resolve_bool_comparison(ir->operands[0], &op[0]);
1647 resolve_bool_comparison(ir->operands[1], &op[1]);
1648 }
1649
1650 /* "!=" operator producing a scalar boolean. */
1651 if (ir->operands[0]->type->is_vector() ||
1652 ir->operands[1]->type->is_vector()) {
1653 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1654
1655 emit(MOV(result_dst, src_reg(0)));
1656 inst = emit(MOV(result_dst, src_reg(~0)));
1657 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1658 } else {
1659 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1660 }
1661 break;
1662
1663 case ir_unop_any:
1664 if (devinfo->gen <= 5) {
1665 resolve_bool_comparison(ir->operands[0], &op[0]);
1666 }
1667 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1668 emit(MOV(result_dst, src_reg(0)));
1669
1670 inst = emit(MOV(result_dst, src_reg(~0)));
1671 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1672 break;
1673
1674 case ir_binop_logic_xor:
1675 emit(XOR(result_dst, op[0], op[1]));
1676 break;
1677
1678 case ir_binop_logic_or:
1679 emit(OR(result_dst, op[0], op[1]));
1680 break;
1681
1682 case ir_binop_logic_and:
1683 emit(AND(result_dst, op[0], op[1]));
1684 break;
1685
1686 case ir_binop_dot:
1687 assert(ir->operands[0]->type->is_vector());
1688 assert(ir->operands[0]->type == ir->operands[1]->type);
1689 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1690 break;
1691
1692 case ir_unop_sqrt:
1693 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1694 break;
1695 case ir_unop_rsq:
1696 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1697 break;
1698
1699 case ir_unop_bitcast_i2f:
1700 case ir_unop_bitcast_u2f:
1701 this->result = op[0];
1702 this->result.type = BRW_REGISTER_TYPE_F;
1703 break;
1704
1705 case ir_unop_bitcast_f2i:
1706 this->result = op[0];
1707 this->result.type = BRW_REGISTER_TYPE_D;
1708 break;
1709
1710 case ir_unop_bitcast_f2u:
1711 this->result = op[0];
1712 this->result.type = BRW_REGISTER_TYPE_UD;
1713 break;
1714
1715 case ir_unop_i2f:
1716 case ir_unop_i2u:
1717 case ir_unop_u2i:
1718 case ir_unop_u2f:
1719 case ir_unop_f2i:
1720 case ir_unop_f2u:
1721 emit(MOV(result_dst, op[0]));
1722 break;
1723 case ir_unop_b2i:
1724 emit(AND(result_dst, op[0], src_reg(1)));
1725 break;
1726 case ir_unop_b2f:
1727 if (devinfo->gen <= 5) {
1728 resolve_bool_comparison(ir->operands[0], &op[0]);
1729 }
1730 op[0].type = BRW_REGISTER_TYPE_D;
1731 result_dst.type = BRW_REGISTER_TYPE_D;
1732 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1733 result_dst.type = BRW_REGISTER_TYPE_F;
1734 break;
1735 case ir_unop_f2b:
1736 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1737 break;
1738 case ir_unop_i2b:
1739 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1740 break;
1741
1742 case ir_unop_trunc:
1743 emit(RNDZ(result_dst, op[0]));
1744 break;
1745 case ir_unop_ceil: {
1746 src_reg tmp = src_reg(this, ir->type);
1747 op[0].negate = !op[0].negate;
1748 emit(RNDD(dst_reg(tmp), op[0]));
1749 tmp.negate = true;
1750 emit(MOV(result_dst, tmp));
1751 }
1752 break;
1753 case ir_unop_floor:
1754 inst = emit(RNDD(result_dst, op[0]));
1755 break;
1756 case ir_unop_fract:
1757 inst = emit(FRC(result_dst, op[0]));
1758 break;
1759 case ir_unop_round_even:
1760 emit(RNDE(result_dst, op[0]));
1761 break;
1762
1763 case ir_binop_min:
1764 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1765 break;
1766 case ir_binop_max:
1767 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1768 break;
1769
1770 case ir_binop_pow:
1771 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1772 break;
1773
1774 case ir_unop_bit_not:
1775 inst = emit(NOT(result_dst, op[0]));
1776 break;
1777 case ir_binop_bit_and:
1778 inst = emit(AND(result_dst, op[0], op[1]));
1779 break;
1780 case ir_binop_bit_xor:
1781 inst = emit(XOR(result_dst, op[0], op[1]));
1782 break;
1783 case ir_binop_bit_or:
1784 inst = emit(OR(result_dst, op[0], op[1]));
1785 break;
1786
1787 case ir_binop_lshift:
1788 inst = emit(SHL(result_dst, op[0], op[1]));
1789 break;
1790
1791 case ir_binop_rshift:
1792 if (ir->type->base_type == GLSL_TYPE_INT)
1793 inst = emit(ASR(result_dst, op[0], op[1]));
1794 else
1795 inst = emit(SHR(result_dst, op[0], op[1]));
1796 break;
1797
1798 case ir_binop_bfm:
1799 emit(BFI1(result_dst, op[0], op[1]));
1800 break;
1801
1802 case ir_binop_ubo_load: {
1803 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1804 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1805 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1806 src_reg offset;
1807
1808 /* Now, load the vector from that offset. */
1809 assert(ir->type->is_vector() || ir->type->is_scalar());
1810
1811 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1812 packed_consts.type = result.type;
1813 src_reg surf_index;
1814
1815 if (const_uniform_block) {
1816 /* The block index is a constant, so just emit the binding table entry
1817 * as an immediate.
1818 */
1819 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1820 const_uniform_block->value.u[0]);
1821 } else {
1822 /* The block index is not a constant. Evaluate the index expression
1823 * per-channel and add the base UBO index; the generator will select
1824 * a value from any live channel.
1825 */
1826 surf_index = src_reg(this, glsl_type::uint_type);
1827 emit(ADD(dst_reg(surf_index), op[0],
1828 src_reg(prog_data->base.binding_table.ubo_start)));
1829
1830 /* Assume this may touch any UBO. It would be nice to provide
1831 * a tighter bound, but the array information is already lowered away.
1832 */
1833 brw_mark_surface_used(&prog_data->base,
1834 prog_data->base.binding_table.ubo_start +
1835 shader_prog->NumUniformBlocks - 1);
1836 }
1837
1838 if (const_offset_ir) {
1839 if (devinfo->gen >= 8) {
1840 /* Store the offset in a GRF so we can send-from-GRF. */
1841 offset = src_reg(this, glsl_type::int_type);
1842 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1843 } else {
1844 /* Immediates are fine on older generations since they'll be moved
1845 * to a (potentially fake) MRF at the generator level.
1846 */
1847 offset = src_reg(const_offset / 16);
1848 }
1849 } else {
1850 offset = src_reg(this, glsl_type::uint_type);
1851 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1852 }
1853
1854 emit_pull_constant_load_reg(dst_reg(packed_consts),
1855 surf_index,
1856 offset,
1857 NULL, NULL /* before_block/inst */);
1858
1859 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1860 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1861 const_offset % 16 / 4,
1862 const_offset % 16 / 4,
1863 const_offset % 16 / 4);
1864
1865 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1866 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1867 emit(CMP(result_dst, packed_consts, src_reg(0u),
1868 BRW_CONDITIONAL_NZ));
1869 } else {
1870 emit(MOV(result_dst, packed_consts));
1871 }
1872 break;
1873 }
1874
1875 case ir_binop_vector_extract:
1876 unreachable("should have been lowered by vec_index_to_cond_assign");
1877
1878 case ir_triop_fma:
1879 op[0] = fix_3src_operand(op[0]);
1880 op[1] = fix_3src_operand(op[1]);
1881 op[2] = fix_3src_operand(op[2]);
1882 /* Note that the instruction's argument order is reversed from GLSL
1883 * and the IR.
1884 */
1885 emit(MAD(result_dst, op[2], op[1], op[0]));
1886 break;
1887
1888 case ir_triop_lrp:
1889 emit_lrp(result_dst, op[0], op[1], op[2]);
1890 break;
1891
1892 case ir_triop_csel:
1893 unreachable("already handled above");
1894 break;
1895
1896 case ir_triop_bfi:
1897 op[0] = fix_3src_operand(op[0]);
1898 op[1] = fix_3src_operand(op[1]);
1899 op[2] = fix_3src_operand(op[2]);
1900 emit(BFI2(result_dst, op[0], op[1], op[2]));
1901 break;
1902
1903 case ir_triop_bitfield_extract:
1904 op[0] = fix_3src_operand(op[0]);
1905 op[1] = fix_3src_operand(op[1]);
1906 op[2] = fix_3src_operand(op[2]);
1907 /* Note that the instruction's argument order is reversed from GLSL
1908 * and the IR.
1909 */
1910 emit(BFE(result_dst, op[2], op[1], op[0]));
1911 break;
1912
1913 case ir_triop_vector_insert:
1914 unreachable("should have been lowered by lower_vector_insert");
1915
1916 case ir_quadop_bitfield_insert:
1917 unreachable("not reached: should be handled by "
1918 "bitfield_insert_to_bfm_bfi\n");
1919
1920 case ir_quadop_vector:
1921 unreachable("not reached: should be handled by lower_quadop_vector");
1922
1923 case ir_unop_pack_half_2x16:
1924 emit_pack_half_2x16(result_dst, op[0]);
1925 break;
1926 case ir_unop_unpack_half_2x16:
1927 emit_unpack_half_2x16(result_dst, op[0]);
1928 break;
1929 case ir_unop_unpack_unorm_4x8:
1930 emit_unpack_unorm_4x8(result_dst, op[0]);
1931 break;
1932 case ir_unop_unpack_snorm_4x8:
1933 emit_unpack_snorm_4x8(result_dst, op[0]);
1934 break;
1935 case ir_unop_pack_unorm_4x8:
1936 emit_pack_unorm_4x8(result_dst, op[0]);
1937 break;
1938 case ir_unop_pack_snorm_4x8:
1939 emit_pack_snorm_4x8(result_dst, op[0]);
1940 break;
1941 case ir_unop_pack_snorm_2x16:
1942 case ir_unop_pack_unorm_2x16:
1943 case ir_unop_unpack_snorm_2x16:
1944 case ir_unop_unpack_unorm_2x16:
1945 unreachable("not reached: should be handled by lower_packing_builtins");
1946 case ir_unop_unpack_half_2x16_split_x:
1947 case ir_unop_unpack_half_2x16_split_y:
1948 case ir_binop_pack_half_2x16_split:
1949 case ir_unop_interpolate_at_centroid:
1950 case ir_binop_interpolate_at_sample:
1951 case ir_binop_interpolate_at_offset:
1952 unreachable("not reached: should not occur in vertex shader");
1953 case ir_binop_ldexp:
1954 unreachable("not reached: should be handled by ldexp_to_arith()");
1955 case ir_unop_d2f:
1956 case ir_unop_f2d:
1957 case ir_unop_d2i:
1958 case ir_unop_i2d:
1959 case ir_unop_d2u:
1960 case ir_unop_u2d:
1961 case ir_unop_d2b:
1962 case ir_unop_pack_double_2x32:
1963 case ir_unop_unpack_double_2x32:
1964 case ir_unop_frexp_sig:
1965 case ir_unop_frexp_exp:
1966 unreachable("fp64 todo");
1967 }
1968 }
1969
1970
1971 void
1972 vec4_visitor::visit(ir_swizzle *ir)
1973 {
1974 /* Note that this handles only swizzles in expressions, not those on the
1975 * left-hand side of an assignment, which use write masking instead. See
1976 * ir_assignment for that.
1977 */
1978 const unsigned swz = brw_compose_swizzle(
1979 brw_swizzle_for_size(ir->type->vector_elements),
1980 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1981
1982 ir->val->accept(this);
1983 this->result = swizzle(this->result, swz);
1984 }
1985
1986 void
1987 vec4_visitor::visit(ir_dereference_variable *ir)
1988 {
1989 const struct glsl_type *type = ir->type;
1990 dst_reg *reg = variable_storage(ir->var);
1991
1992 if (!reg) {
1993 fail("Failed to find variable storage for %s\n", ir->var->name);
1994 this->result = src_reg(brw_null_reg());
1995 return;
1996 }
1997
1998 this->result = src_reg(*reg);
1999
2000 /* System values get their swizzle from the dst_reg writemask */
2001 if (ir->var->data.mode == ir_var_system_value)
2002 return;
2003
2004 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2005 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2006 }
2007
2008
2009 int
2010 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2011 {
2012 /* Under normal circumstances array elements are stored consecutively, so
2013 * the stride is equal to the size of the array element.
2014 */
2015 return type_size(ir->type);
2016 }
2017
2018
2019 void
2020 vec4_visitor::visit(ir_dereference_array *ir)
2021 {
2022 ir_constant *constant_index;
2023 src_reg src;
2024 int array_stride = compute_array_stride(ir);
2025
2026 constant_index = ir->array_index->constant_expression_value();
2027
2028 ir->array->accept(this);
2029 src = this->result;
2030
2031 if (constant_index) {
2032 src.reg_offset += constant_index->value.i[0] * array_stride;
2033 } else {
2034 /* Variable index array dereference. It combines the "vec4" at the
2035 * base of the array with an index register that offsets the Mesa
2036 * register index.
2037 */
2038 ir->array_index->accept(this);
2039
2040 src_reg index_reg;
2041
2042 if (array_stride == 1) {
2043 index_reg = this->result;
2044 } else {
2045 index_reg = src_reg(this, glsl_type::int_type);
2046
2047 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2048 }
2049
2050 if (src.reladdr) {
2051 src_reg temp = src_reg(this, glsl_type::int_type);
2052
2053 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2054
2055 index_reg = temp;
2056 }
2057
2058 src.reladdr = ralloc(mem_ctx, src_reg);
2059 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2060 }
2061
2062 /* If the type is smaller than a vec4, replicate the last channel out. */
2063 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2064 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2065 else
2066 src.swizzle = BRW_SWIZZLE_NOOP;
2067 src.type = brw_type_for_base_type(ir->type);
2068
2069 this->result = src;
2070 }
2071
2072 void
2073 vec4_visitor::visit(ir_dereference_record *ir)
2074 {
2075 unsigned int i;
2076 const glsl_type *struct_type = ir->record->type;
2077 int offset = 0;
2078
2079 ir->record->accept(this);
2080
2081 for (i = 0; i < struct_type->length; i++) {
2082 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2083 break;
2084 offset += type_size(struct_type->fields.structure[i].type);
2085 }
2086
2087 /* If the type is smaller than a vec4, replicate the last channel out. */
2088 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2089 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2090 else
2091 this->result.swizzle = BRW_SWIZZLE_NOOP;
2092 this->result.type = brw_type_for_base_type(ir->type);
2093
2094 this->result.reg_offset += offset;
2095 }
2096
2097 /**
2098 * We want to be careful in assignment setup to hit the actual storage
2099 * instead of potentially using a temporary like we might with the
2100 * ir_dereference handler.
2101 */
2102 static dst_reg
2103 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2104 {
2105 /* The LHS must be a dereference. If the LHS is a variable indexed array
2106 * access of a vector, it must be separated into a series of conditional moves
2107 * before reaching this point (see ir_vec_index_to_cond_assign).
2108 */
2109 assert(ir->as_dereference());
2110 ir_dereference_array *deref_array = ir->as_dereference_array();
2111 if (deref_array) {
2112 assert(!deref_array->array->type->is_vector());
2113 }
2114
2115 /* Use the rvalue deref handler for the most part. We'll ignore any
2116 * swizzles it produces and express LHS swizzles via the writemask instead.
2117 */
2118 ir->accept(v);
2119 return dst_reg(v->result);
2120 }
2121
2122 void
2123 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2124 const struct glsl_type *type,
2125 enum brw_predicate predicate)
2126 {
2127 if (type->base_type == GLSL_TYPE_STRUCT) {
2128 for (unsigned int i = 0; i < type->length; i++) {
2129 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2130 }
2131 return;
2132 }
2133
2134 if (type->is_array()) {
2135 for (unsigned int i = 0; i < type->length; i++) {
2136 emit_block_move(dst, src, type->fields.array, predicate);
2137 }
2138 return;
2139 }
2140
2141 if (type->is_matrix()) {
2142 const struct glsl_type *vec_type;
2143
2144 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2145 type->vector_elements, 1);
2146
2147 for (int i = 0; i < type->matrix_columns; i++) {
2148 emit_block_move(dst, src, vec_type, predicate);
2149 }
2150 return;
2151 }
2152
2153 assert(type->is_scalar() || type->is_vector());
2154
2155 dst->type = brw_type_for_base_type(type);
2156 src->type = dst->type;
2157
2158 dst->writemask = (1 << type->vector_elements) - 1;
2159
2160 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2161
2162 vec4_instruction *inst = emit(MOV(*dst, *src));
2163 inst->predicate = predicate;
2164
2165 dst->reg_offset++;
2166 src->reg_offset++;
2167 }
2168
2169
2170 /* If the RHS processing resulted in an instruction generating a
2171 * temporary value, and it would be easy to rewrite the instruction to
2172 * generate its result right into the LHS instead, do so. This ends
2173 * up reliably removing instructions where it can be tricky to do so
2174 * later without real UD chain information.
2175 */
2176 bool
2177 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2178 dst_reg dst,
2179 src_reg src,
2180 vec4_instruction *pre_rhs_inst,
2181 vec4_instruction *last_rhs_inst)
2182 {
2183 /* This could be supported, but it would take more smarts. */
2184 if (ir->condition)
2185 return false;
2186
2187 if (pre_rhs_inst == last_rhs_inst)
2188 return false; /* No instructions generated to work with. */
2189
2190 /* Make sure the last instruction generated our source reg. */
2191 if (src.file != GRF ||
2192 src.file != last_rhs_inst->dst.file ||
2193 src.reg != last_rhs_inst->dst.reg ||
2194 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2195 src.reladdr ||
2196 src.abs ||
2197 src.negate ||
2198 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2199 return false;
2200
2201 /* Check that the last instruction fully initialized the channels
2202 * we want to use, in the order we want to use them. We could
2203 * potentially reswizzle the operands of many instructions so that
2204 * we could handle out of order channels, but don't yet.
2205 */
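/* For example, if dst.writemask is XY, the last instruction must have
 * written at least .x and .y, and the source must read them with an
 * identity .xy swizzle for the rewrite to be safe.
 */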
2206
2207 for (unsigned i = 0; i < 4; i++) {
2208 if (dst.writemask & (1 << i)) {
2209 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2210 return false;
2211
2212 if (BRW_GET_SWZ(src.swizzle, i) != i)
2213 return false;
2214 }
2215 }
2216
2217 /* Success! Rewrite the instruction. */
2218 last_rhs_inst->dst.file = dst.file;
2219 last_rhs_inst->dst.reg = dst.reg;
2220 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2221 last_rhs_inst->dst.reladdr = dst.reladdr;
2222 last_rhs_inst->dst.writemask &= dst.writemask;
2223
2224 return true;
2225 }
2226
2227 void
2228 vec4_visitor::visit(ir_assignment *ir)
2229 {
2230 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2231 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2232
2233 if (!ir->lhs->type->is_scalar() &&
2234 !ir->lhs->type->is_vector()) {
2235 ir->rhs->accept(this);
2236 src_reg src = this->result;
2237
2238 if (ir->condition) {
2239 emit_bool_to_cond_code(ir->condition, &predicate);
2240 }
2241
2242 /* emit_block_move doesn't account for swizzles in the source register.
2243 * This should be ok, since the source register is a structure or an
2244 * array, and those can't be swizzled. But double-check to be sure.
2245 */
2246 assert(src.swizzle ==
2247 (ir->rhs->type->is_matrix()
2248 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2249 : BRW_SWIZZLE_NOOP));
2250
2251 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2252 return;
2253 }
2254
2255 /* Now we're down to just a scalar/vector with writemasks. */
2256 int i;
2257
2258 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2259 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2260
2261 ir->rhs->accept(this);
2262
2263 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2264
2265 int swizzles[4];
2266 int src_chan = 0;
2267
2268 assert(ir->lhs->type->is_vector() ||
2269 ir->lhs->type->is_scalar());
2270 dst.writemask = ir->write_mask;
2271
2272 /* Swizzle a small RHS vector into the channels being written.
2273 *
2274 * GLSL IR treats write_mask as dictating how many channels are
2275 * present on the RHS, while in our instructions we need to make
2276 * those channels appear in the slots of the vec4 they're written to.
2277 */
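/* For example, a write mask of XZ gives swizzles of {0, 0, 1, 0}: the
 * RHS .x lands in the destination's X channel and the RHS .y in Z.
 */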
2278 for (int i = 0; i < 4; i++)
2279 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2280
2281 src_reg src = swizzle(this->result,
2282 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2283 swizzles[2], swizzles[3]));
2284
2285 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2286 return;
2287 }
2288
2289 if (ir->condition) {
2290 emit_bool_to_cond_code(ir->condition, &predicate);
2291 }
2292
2293 for (i = 0; i < type_size(ir->lhs->type); i++) {
2294 vec4_instruction *inst = emit(MOV(dst, src));
2295 inst->predicate = predicate;
2296
2297 dst.reg_offset++;
2298 src.reg_offset++;
2299 }
2300 }
2301
2302 void
2303 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2304 {
2305 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2306 foreach_in_list(ir_constant, field_value, &ir->components) {
2307 emit_constant_values(dst, field_value);
2308 }
2309 return;
2310 }
2311
2312 if (ir->type->is_array()) {
2313 for (unsigned int i = 0; i < ir->type->length; i++) {
2314 emit_constant_values(dst, ir->array_elements[i]);
2315 }
2316 return;
2317 }
2318
2319 if (ir->type->is_matrix()) {
2320 for (int i = 0; i < ir->type->matrix_columns; i++) {
2321 float *vec = &ir->value.f[i * ir->type->vector_elements];
2322
2323 for (int j = 0; j < ir->type->vector_elements; j++) {
2324 dst->writemask = 1 << j;
2325 dst->type = BRW_REGISTER_TYPE_F;
2326
2327 emit(MOV(*dst, src_reg(vec[j])));
2328 }
2329 dst->reg_offset++;
2330 }
2331 return;
2332 }
2333
2334 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2335
2336 for (int i = 0; i < ir->type->vector_elements; i++) {
2337 if (!(remaining_writemask & (1 << i)))
2338 continue;
2339
2340 dst->writemask = 1 << i;
2341 dst->type = brw_type_for_base_type(ir->type);
2342
2343 /* Find other components that match the one we're about to
2344 * write. Emits fewer instructions for things like vec4(0.5,
2345 * 1.5, 1.5, 1.5).
2346 */
2347 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2348 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2349 if (ir->value.b[i] == ir->value.b[j])
2350 dst->writemask |= (1 << j);
2351 } else {
2352 /* u, i, and f storage all line up, so no need for a
2353 * switch case for comparing each type.
2354 */
2355 if (ir->value.u[i] == ir->value.u[j])
2356 dst->writemask |= (1 << j);
2357 }
2358 }
2359
2360 switch (ir->type->base_type) {
2361 case GLSL_TYPE_FLOAT:
2362 emit(MOV(*dst, src_reg(ir->value.f[i])));
2363 break;
2364 case GLSL_TYPE_INT:
2365 emit(MOV(*dst, src_reg(ir->value.i[i])));
2366 break;
2367 case GLSL_TYPE_UINT:
2368 emit(MOV(*dst, src_reg(ir->value.u[i])));
2369 break;
2370 case GLSL_TYPE_BOOL:
2371 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2372 break;
2373 default:
2374 unreachable("Non-float/uint/int/bool constant");
2375 }
2376
2377 remaining_writemask &= ~dst->writemask;
2378 }
2379 dst->reg_offset++;
2380 }
2381
2382 void
2383 vec4_visitor::visit(ir_constant *ir)
2384 {
2385 dst_reg dst = dst_reg(this, ir->type);
2386 this->result = src_reg(dst);
2387
2388 emit_constant_values(&dst, ir);
2389 }
2390
2391 void
2392 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2393 {
2394 ir_dereference *deref = static_cast<ir_dereference *>(
2395 ir->actual_parameters.get_head());
2396 ir_variable *location = deref->variable_referenced();
2397 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2398 location->data.binding);
2399
2400 /* Calculate the surface offset */
2401 src_reg offset(this, glsl_type::uint_type);
2402 ir_dereference_array *deref_array = deref->as_dereference_array();
2403 if (deref_array) {
2404 deref_array->array_index->accept(this);
2405
2406 src_reg tmp(this, glsl_type::uint_type);
2407 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2408 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2409 } else {
2410 offset = location->data.atomic.offset;
2411 }
2412
2413 /* Emit the appropriate machine instruction */
2414 const char *callee = ir->callee->function_name();
2415 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2416
2417 if (!strcmp("__intrinsic_atomic_read", callee)) {
2418 emit_untyped_surface_read(surf_index, dst, offset);
2419
2420 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2421 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2422 src_reg(), src_reg());
2423
2424 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2425 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2426 src_reg(), src_reg());
2427 }
2428 }
2429
2430 void
2431 vec4_visitor::visit(ir_call *ir)
2432 {
2433 const char *callee = ir->callee->function_name();
2434
2435 if (!strcmp("__intrinsic_atomic_read", callee) ||
2436 !strcmp("__intrinsic_atomic_increment", callee) ||
2437 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2438 visit_atomic_counter_intrinsic(ir);
2439 } else {
2440 unreachable("Unsupported intrinsic.");
2441 }
2442 }
2443
2444 src_reg
2445 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2446 {
2447 vec4_instruction *inst =
2448 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2449 dst_reg(this, glsl_type::uvec4_type));
2450 inst->base_mrf = 2;
2451 inst->mlen = 1;
2452 inst->src[1] = sampler;
2453
2454 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2455 int param_base = inst->base_mrf;
2456 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2457 int zero_mask = 0xf & ~coord_mask;
2458
2459 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2460 coordinate));
2461
2462 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2463 src_reg(0)));
2464
2465 emit(inst);
2466 return src_reg(inst->dst);
2467 }
2468
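/* Returns whether the sampler index has to be conveyed via the message
 * header. Sampler indices above 15 don't fit in the 4-bit field of the
 * sampler message descriptor, and a dynamically indexed sampler may end
 * up there too; only Haswell and Gen8+ expose that many samplers at all.
 */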
2469 static bool
2470 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2471 {
2472 if (devinfo->gen < 8 && !devinfo->is_haswell)
2473 return false;
2474
2475 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2476 }
2477
2478 void
2479 vec4_visitor::visit(ir_texture *ir)
2480 {
2481 uint32_t sampler =
2482 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2483
2484 ir_rvalue *nonconst_sampler_index =
2485 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2486
2487 /* Handle non-constant sampler array indexing */
2488 src_reg sampler_reg;
2489 if (nonconst_sampler_index) {
2490 /* The highest sampler which may be used by this operation is
2491 * the last element of the array. Mark it here, because the generator
2492 * doesn't have enough information to determine the bound.
2493 */
2494 uint32_t array_size = ir->sampler->as_dereference_array()
2495 ->array->type->array_size();
2496
2497 uint32_t max_used = sampler + array_size - 1;
2498 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2499 max_used += prog_data->base.binding_table.gather_texture_start;
2500 } else {
2501 max_used += prog_data->base.binding_table.texture_start;
2502 }
2503
2504 brw_mark_surface_used(&prog_data->base, max_used);
2505
2506 /* Emit code to evaluate the actual indexing expression */
2507 nonconst_sampler_index->accept(this);
2508 dst_reg temp(this, glsl_type::uint_type);
2509 emit(ADD(temp, this->result, src_reg(sampler)))
2510 ->force_writemask_all = true;
2511 sampler_reg = src_reg(temp);
2512 } else {
2513 /* Single sampler, or constant array index; the indexing expression
2514 * is just an immediate.
2515 */
2516 sampler_reg = src_reg(sampler);
2517 }
2518
2519 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2520 * emitting anything other than setting up the constant result.
2521 */
2522 if (ir->op == ir_tg4) {
2523 ir_constant *chan = ir->lod_info.component->as_constant();
2524 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2525 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2526 dst_reg result(this, ir->type);
2527 this->result = src_reg(result);
2528 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2529 return;
2530 }
2531 }
2532
2533 /* Should be lowered by do_lower_texture_projection */
2534 assert(!ir->projector);
2535
2536 /* Should be lowered */
2537 assert(!ir->offset || !ir->offset->type->is_array());
2538
2539 /* Generate code to compute all the subexpression trees. This has to be
2540 * done before loading any values into MRFs for the sampler message since
2541 * generating these values may involve SEND messages that need the MRFs.
2542 */
2543 src_reg coordinate;
2544 if (ir->coordinate) {
2545 ir->coordinate->accept(this);
2546 coordinate = this->result;
2547 }
2548
2549 src_reg shadow_comparitor;
2550 if (ir->shadow_comparitor) {
2551 ir->shadow_comparitor->accept(this);
2552 shadow_comparitor = this->result;
2553 }
2554
2555 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2556 src_reg offset_value;
2557 if (has_nonconstant_offset) {
2558 ir->offset->accept(this);
2559 offset_value = src_reg(this->result);
2560 }
2561
2562 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2563 src_reg lod, dPdx, dPdy, sample_index, mcs;
2564 switch (ir->op) {
2565 case ir_tex:
2566 lod = src_reg(0.0f);
2567 lod_type = glsl_type::float_type;
2568 break;
2569 case ir_txf:
2570 case ir_txl:
2571 case ir_txs:
2572 ir->lod_info.lod->accept(this);
2573 lod = this->result;
2574 lod_type = ir->lod_info.lod->type;
2575 break;
2576 case ir_query_levels:
2577 lod = src_reg(0);
2578 lod_type = glsl_type::int_type;
2579 break;
2580 case ir_txf_ms:
2581 ir->lod_info.sample_index->accept(this);
2582 sample_index = this->result;
2583 sample_index_type = ir->lod_info.sample_index->type;
2584
2585 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2586 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2587 else
2588 mcs = src_reg(0u);
2589 break;
2590 case ir_txd:
2591 ir->lod_info.grad.dPdx->accept(this);
2592 dPdx = this->result;
2593
2594 ir->lod_info.grad.dPdy->accept(this);
2595 dPdy = this->result;
2596
2597 lod_type = ir->lod_info.grad.dPdx->type;
2598 break;
2599 case ir_txb:
2600 case ir_lod:
2601 case ir_tg4:
2602 break;
2603 }
2604
2605 enum opcode opcode;
2606 switch (ir->op) {
2607 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2608 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2609 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2610 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2611 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2612 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2613 case ir_tg4: opcode = has_nonconstant_offset
2614 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2615 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2616 case ir_txb:
2617 unreachable("TXB is not valid for vertex shaders.");
2618 case ir_lod:
2619 unreachable("LOD is not valid for vertex shaders.");
2620 default:
2621 unreachable("Unrecognized tex op");
2622 }
2623
2624 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2625 opcode, dst_reg(this, ir->type));
2626
2627 if (ir->offset != NULL && !has_nonconstant_offset) {
2628 inst->offset =
2629 brw_texture_offset(ir->offset->as_constant()->value.i,
2630 ir->offset->type->vector_elements);
2631 }
2632
2633 /* Stuff the channel select bits in the top of the texture offset */
2634 if (ir->op == ir_tg4)
2635 inst->offset |= gather_channel(ir, sampler) << 16;
2636
2637 /* The message header is necessary for:
2638 * - Gen4 (always)
2639 * - Gen9+ for selecting SIMD4x2
2640 * - Texel offsets
2641 * - Gather channel selection
2642 * - Sampler indices too large to fit in a 4-bit value.
2643 */
2644 inst->header_present =
2645 devinfo->gen < 5 || devinfo->gen >= 9 ||
2646 inst->offset != 0 || ir->op == ir_tg4 ||
2647 is_high_sampler(devinfo, sampler_reg);
2648 inst->base_mrf = 2;
2649 inst->mlen = inst->header_present + 1; /* always at least one */
2650 inst->dst.writemask = WRITEMASK_XYZW;
2651 inst->shadow_compare = ir->shadow_comparitor != NULL;
2652
2653 inst->src[1] = sampler_reg;
2654
2655 /* MRF for the first parameter */
2656 int param_base = inst->base_mrf + inst->header_present;
2657
2658 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2659 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2660 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2661 } else {
2662 /* Load the coordinate */
2663 /* FINISHME: gl_clamp_mask and saturate */
2664 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2665 int zero_mask = 0xf & ~coord_mask;
2666
2667 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2668 coordinate));
2669
2670 if (zero_mask != 0) {
2671 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2672 src_reg(0)));
2673 }
2674 /* Load the shadow comparitor */
2675 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2676 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2677 WRITEMASK_X),
2678 shadow_comparitor));
2679 inst->mlen++;
2680 }
2681
2682 /* Load the LOD info */
2683 if (ir->op == ir_tex || ir->op == ir_txl) {
2684 int mrf, writemask;
2685 if (devinfo->gen >= 5) {
2686 mrf = param_base + 1;
2687 if (ir->shadow_comparitor) {
2688 writemask = WRITEMASK_Y;
2689 /* mlen already incremented */
2690 } else {
2691 writemask = WRITEMASK_X;
2692 inst->mlen++;
2693 }
2694 } else /* devinfo->gen == 4 */ {
2695 mrf = param_base;
2696 writemask = WRITEMASK_W;
2697 }
2698 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2699 } else if (ir->op == ir_txf) {
2700 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2701 } else if (ir->op == ir_txf_ms) {
2702 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2703 sample_index));
2704 if (devinfo->gen >= 7) {
2705 /* MCS data is in the first channel of `mcs`, but we need to get it into
2706 * the .y channel of the second vec4 of params, so replicate .x across
2707 * the whole vec4 and then mask off everything except .y
2708 */
2709 mcs.swizzle = BRW_SWIZZLE_XXXX;
2710 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2711 mcs));
2712 }
2713 inst->mlen++;
2714 } else if (ir->op == ir_txd) {
2715 const glsl_type *type = lod_type;
2716
2717 if (devinfo->gen >= 5) {
2718 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2719 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2720 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2721 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2722 inst->mlen++;
2723
2724 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2725 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2726 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2727 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2728 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2729 inst->mlen++;
2730
2731 if (ir->shadow_comparitor) {
2732 emit(MOV(dst_reg(MRF, param_base + 2,
2733 ir->shadow_comparitor->type, WRITEMASK_Z),
2734 shadow_comparitor));
2735 }
2736 }
2737 } else /* devinfo->gen == 4 */ {
2738 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2739 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2740 inst->mlen += 2;
2741 }
2742 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2743 if (ir->shadow_comparitor) {
2744 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2745 shadow_comparitor));
2746 }
2747
2748 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2749 offset_value));
2750 inst->mlen++;
2751 }
2752 }
2753
2754 emit(inst);
2755
2756 /* Fix up the number of layers (z) for cube arrays: the hardware returns
2757 * faces * layers, but the spec requires just layers.
2758 */
2759 if (ir->op == ir_txs) {
2760 glsl_type const *type = ir->sampler->type;
2761 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2762 type->sampler_array) {
2763 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2764 writemask(inst->dst, WRITEMASK_Z),
2765 src_reg(inst->dst), src_reg(6));
2766 }
2767 }
2768
2769 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2770 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2771 }
2772
2773 swizzle_result(ir, src_reg(inst->dst), sampler);
2774 }
2775
2776 /**
2777 * Apply workarounds for Gen6 gather with UINT/SINT
2778 */
2779 void
2780 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2781 {
2782 if (!wa)
2783 return;
2784
2785 int width = (wa & WA_8BIT) ? 8 : 16;
2786 dst_reg dst_f = dst;
2787 dst_f.type = BRW_REGISTER_TYPE_F;
2788
2789 /* Convert from UNORM to UINT */
2790 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2791 emit(MOV(dst, src_reg(dst_f)));
2792
2793 if (wa & WA_SIGN) {
2794 /* Reinterpret the UINT value as a signed INT value by
2795 * shifting the sign bit into place, then shifting back
2796 * preserving sign.
2797 */
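/* For example, with width == 8 a gathered value of 255 becomes
 * (255 << 24) >> 24 == -1 after the arithmetic shift.
 */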
2798 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2799 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2800 }
2801 }
2802
2803 /**
2804 * Set up the gather channel based on the swizzle, for gather4.
2805 */
2806 uint32_t
2807 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2808 {
2809 ir_constant *chan = ir->lod_info.component->as_constant();
2810 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2811 switch (swiz) {
2812 case SWIZZLE_X: return 0;
2813 case SWIZZLE_Y:
2814 /* gather4 sampler is broken for green channel on RG32F --
2815 * we must ask for blue instead.
2816 */
2817 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2818 return 2;
2819 return 1;
2820 case SWIZZLE_Z: return 2;
2821 case SWIZZLE_W: return 3;
2822 default:
2823 unreachable("Not reached"); /* zero, one swizzles handled already */
2824 }
2825 }
2826
2827 void
2828 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2829 {
2830 int s = key->tex.swizzles[sampler];
2831
2832 this->result = src_reg(this, ir->type);
2833 dst_reg swizzled_result(this->result);
2834
2835 if (ir->op == ir_query_levels) {
2836 /* # levels is in .w */
2837 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2838 emit(MOV(swizzled_result, orig_val));
2839 return;
2840 }
2841
2842 if (ir->op == ir_txs || ir->type == glsl_type::float_type ||
2843 s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2844 emit(MOV(swizzled_result, orig_val));
2845 return;
2846 }
2847
2848
2849 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2850 int swizzle[4] = {0};
2851
2852 for (int i = 0; i < 4; i++) {
2853 switch (GET_SWZ(s, i)) {
2854 case SWIZZLE_ZERO:
2855 zero_mask |= (1 << i);
2856 break;
2857 case SWIZZLE_ONE:
2858 one_mask |= (1 << i);
2859 break;
2860 default:
2861 copy_mask |= (1 << i);
2862 swizzle[i] = GET_SWZ(s, i);
2863 break;
2864 }
2865 }
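/* For example, a swizzle of RG01 copies .xy from the texture result,
 * writes zero to .z and one to .w, using the three MOVs below.
 */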
2866
2867 if (copy_mask) {
2868 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2869 swizzled_result.writemask = copy_mask;
2870 emit(MOV(swizzled_result, orig_val));
2871 }
2872
2873 if (zero_mask) {
2874 swizzled_result.writemask = zero_mask;
2875 emit(MOV(swizzled_result, src_reg(0.0f)));
2876 }
2877
2878 if (one_mask) {
2879 swizzled_result.writemask = one_mask;
2880 emit(MOV(swizzled_result, src_reg(1.0f)));
2881 }
2882 }
2883
2884 void
2885 vec4_visitor::visit(ir_return *)
2886 {
2887 unreachable("not reached");
2888 }
2889
2890 void
2891 vec4_visitor::visit(ir_discard *)
2892 {
2893 unreachable("not reached");
2894 }
2895
2896 void
2897 vec4_visitor::visit(ir_if *ir)
2898 {
2899 /* Don't point the annotation at the if statement itself, because then it
2900 * and the then and else blocks would all get printed.
2901 */
2902 this->base_ir = ir->condition;
2903
2904 if (devinfo->gen == 6) {
2905 emit_if_gen6(ir);
2906 } else {
2907 enum brw_predicate predicate;
2908 emit_bool_to_cond_code(ir->condition, &predicate);
2909 emit(IF(predicate));
2910 }
2911
2912 visit_instructions(&ir->then_instructions);
2913
2914 if (!ir->else_instructions.is_empty()) {
2915 this->base_ir = ir->condition;
2916 emit(BRW_OPCODE_ELSE);
2917
2918 visit_instructions(&ir->else_instructions);
2919 }
2920
2921 this->base_ir = ir->condition;
2922 emit(BRW_OPCODE_ENDIF);
2923 }
2924
2925 void
2926 vec4_visitor::visit(ir_emit_vertex *)
2927 {
2928 unreachable("not reached");
2929 }
2930
2931 void
2932 vec4_visitor::visit(ir_end_primitive *)
2933 {
2934 unreachable("not reached");
2935 }
2936
2937 void
2938 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2939 dst_reg dst, src_reg offset,
2940 src_reg src0, src_reg src1)
2941 {
2942 unsigned mlen = 0;
2943
2944 /* Set the atomic operation offset. */
2945 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2946 mlen++;
2947
2948 /* Set the atomic operation arguments. */
2949 if (src0.file != BAD_FILE) {
2950 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2951 mlen++;
2952 }
2953
2954 if (src1.file != BAD_FILE) {
2955 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2956 mlen++;
2957 }
2958
2959 /* Emit the instruction. Note that this maps to the normal SIMD8
2960 * untyped atomic message on Ivy Bridge, but that's OK because
2961 * unused channels will be masked out.
2962 */
2963 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2964 src_reg(atomic_op), src_reg(surf_index));
2965 inst->base_mrf = 0;
2966 inst->mlen = mlen;
2967 }
2968
2969 void
2970 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2971 src_reg offset)
2972 {
2973 /* Set the surface read offset. */
2974 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2975
2976 /* Emit the instruction. Note that this maps to the normal SIMD8
2977 * untyped surface read message, but that's OK because unused
2978 * channels will be masked out.
2979 */
2980 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2981 dst, src_reg(surf_index));
2982 inst->base_mrf = 0;
2983 inst->mlen = 1;
2984 }
2985
2986 void
2987 vec4_visitor::emit_ndc_computation()
2988 {
2989 /* Get the position */
2990 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2991
2992 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2993 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2994 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2995
2996 current_annotation = "NDC";
2997 dst_reg ndc_w = ndc;
2998 ndc_w.writemask = WRITEMASK_W;
2999 src_reg pos_w = pos;
3000 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3001 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3002
3003 dst_reg ndc_xyz = ndc;
3004 ndc_xyz.writemask = WRITEMASK_XYZ;
3005
3006 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3007 }
3008
3009 void
3010 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3011 {
3012 if (devinfo->gen < 6 &&
3013 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3014 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3015 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3016 dst_reg header1_w = header1;
3017 header1_w.writemask = WRITEMASK_W;
3018
3019 emit(MOV(header1, 0u));
3020
3021 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3022 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3023
3024 current_annotation = "Point size";
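/* Multiplying by 2^11 and masking with 0x7ff << 8 leaves an 11-bit
 * fixed-point point width in bits 18:8 of the header DWord. For example,
 * psiz == 4.0f becomes 4.0 * 2048 == 0x2000, which the mask preserves.
 */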
3025 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3026 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3027 }
3028
3029 if (key->userclip_active) {
3030 current_annotation = "Clipping flags";
3031 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3032 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3033
3034 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3035 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3036 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3037
3038 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3039 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3040 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3041 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3042 }
3043
3044 /* i965 clipping workaround:
3045 * 1) Test for -ve rhw
3046 * 2) If set,
3047 * set ndc = (0,0,0,0)
3048 * set ucp[6] = 1
3049 *
3050 * Later, clipping will detect ucp[6] and ensure the primitive is
3051 * clipped against all fixed planes.
3052 */
3053 if (devinfo->has_negative_rhw_bug) {
3054 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3055 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3056 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3057 vec4_instruction *inst;
3058 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3059 inst->predicate = BRW_PREDICATE_NORMAL;
3060 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3061 inst->predicate = BRW_PREDICATE_NORMAL;
3062 }
3063
3064 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3065 } else if (devinfo->gen < 6) {
3066 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3067 } else {
3068 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3069 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3070 dst_reg reg_w = reg;
3071 reg_w.writemask = WRITEMASK_W;
3072 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3073 }
3074 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3075 dst_reg reg_y = reg;
3076 reg_y.writemask = WRITEMASK_Y;
3077 reg_y.type = BRW_REGISTER_TYPE_D;
3078 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3079 }
3080 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3081 dst_reg reg_z = reg;
3082 reg_z.writemask = WRITEMASK_Z;
3083 reg_z.type = BRW_REGISTER_TYPE_D;
3084 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3085 }
3086 }
3087 }
3088
3089 void
3090 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3091 {
3092 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3093 *
3094 * "If a linked set of shaders forming the vertex stage contains no
3095 * static write to gl_ClipVertex or gl_ClipDistance, but the
3096 * application has requested clipping against user clip planes through
3097 * the API, then the coordinate written to gl_Position is used for
3098 * comparison against the user clip planes."
3099 *
3100 * This function is only called if the shader didn't write to
3101 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3102 * if the user wrote to it; otherwise we use gl_Position.
3103 */
3104 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3105 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3106 clip_vertex = VARYING_SLOT_POS;
3107 }
3108
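/* Each enabled clip distance is the 4-component dot product of the chosen
 * clip vertex with the corresponding user clip plane; 'offset' selects
 * planes 0-3 (CLIP_DIST0) or 4-7 (CLIP_DIST1).
 */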
3109 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3110 ++i) {
3111 reg.writemask = 1 << i;
3112 emit(DP4(reg,
3113 src_reg(output_reg[clip_vertex]),
3114 src_reg(this->userplane[i + offset])));
3115 }
3116 }
3117
3118 vec4_instruction *
3119 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3120 {
3121 assert(varying < VARYING_SLOT_MAX);
3122 reg.type = output_reg[varying].type;
3123 current_annotation = output_reg_annotation[varying];
3124 /* Copy the register, saturating if necessary */
3125 return emit(MOV(reg, src_reg(output_reg[varying])));
3126 }
3127
3128 void
3129 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3130 {
3131 reg.type = BRW_REGISTER_TYPE_F;
3132
3133 switch (varying) {
3134 case VARYING_SLOT_PSIZ:
3135 {
3136 /* PSIZ is always in slot 0, and is coupled with other flags. */
3137 current_annotation = "indices, point width, clip flags";
3138 emit_psiz_and_flags(reg);
3139 break;
3140 }
3141 case BRW_VARYING_SLOT_NDC:
3142 current_annotation = "NDC";
3143 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3144 break;
3145 case VARYING_SLOT_POS:
3146 current_annotation = "gl_Position";
3147 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3148 break;
3149 case VARYING_SLOT_EDGE:
3150 /* This is present when doing unfilled polygons. We're supposed to copy
3151 * the edge flag from the user-provided vertex array
3152 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3153 * of that attribute (starts as 1.0f). This is then used in clipping to
3154 * determine which edges should be drawn as wireframe.
3155 */
3156 current_annotation = "edge flag";
3157 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3158 glsl_type::float_type, WRITEMASK_XYZW))));
3159 break;
3160 case BRW_VARYING_SLOT_PAD:
3161 /* No need to write to this slot */
3162 break;
3163 case VARYING_SLOT_COL0:
3164 case VARYING_SLOT_COL1:
3165 case VARYING_SLOT_BFC0:
3166 case VARYING_SLOT_BFC1: {
3167 /* These built-in varyings are only supported in compatibility mode,
3168 * and we only support GS in core profile. So, this must be a vertex
3169 * shader.
3170 */
3171 assert(stage == MESA_SHADER_VERTEX);
3172 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3173 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3174 inst->saturate = true;
3175 break;
3176 }
3177
3178 default:
3179 emit_generic_urb_slot(reg, varying);
3180 break;
3181 }
3182 }
3183
3184 static int
3185 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3186 {
3187 if (devinfo->gen >= 6) {
3188 /* URB data written (does not include the message header reg) must
3189 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3190 * section 5.4.3.2.2: URB_INTERLEAVED.
3191 *
3192 * URB entries are allocated on a multiple of 1024 bits, so an
3193 * extra 128 bits written here to make the end align to 256 is
3194 * no problem.
3195 */
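/* mlen here includes the header register, so the data length is mlen - 1.
 * For example, mlen == 4 (header plus 3 data registers) is padded to 5 so
 * that an even 4 data registers are written.
 */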
3196 if ((mlen % 2) != 1)
3197 mlen++;
3198 }
3199
3200 return mlen;
3201 }
3202
3203
3204 /**
3205 * Generates the VUE payload plus the necessary URB write instructions to
3206 * output it.
3207 *
3208 * The VUE layout is documented in Volume 2a.
3209 */
3210 void
3211 vec4_visitor::emit_vertex()
3212 {
3213 /* MRF 0 is reserved for the debugger, so start with message header
3214 * in MRF 1.
3215 */
3216 int base_mrf = 1;
3217 int mrf = base_mrf;
3218 /* In the process of generating our URB write message contents, we
3219 * may need to unspill a register or load from an array. Those
3220 * reads would use MRFs 14-15.
3221 */
3222 int max_usable_mrf = 13;
3223
3224 /* The following assertion verifies that max_usable_mrf causes an
3225 * even-numbered amount of URB write data, which will meet gen6's
3226 * requirements for length alignment.
3227 */
3228 assert((max_usable_mrf - base_mrf) % 2 == 0);
3229
3230 /* First mrf is the g0-based message header containing URB handles and
3231 * such.
3232 */
3233 emit_urb_write_header(mrf++);
3234
3235 if (devinfo->gen < 6) {
3236 emit_ndc_computation();
3237 }
3238
3239 /* Lower legacy ff and ClipVertex clipping to clip distances */
3240 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3241 current_annotation = "user clip distances";
3242
3243 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3244 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3245
3246 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3247 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3248 }
3249
3250 /* We may need to split this up into several URB writes, so do them in a
3251 * loop.
3252 */
3253 int slot = 0;
3254 bool complete = false;
3255 do {
3256 /* URB offset is in URB row increments, and each of our MRFs is half of
3257 * one of those, since we're doing interleaved writes.
3258 */
3259 int offset = slot / 2;
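/* For example, if the first write covered slots 0..11, the next iteration
 * resumes at slot 12, i.e. URB row offset 6.
 */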
3260
3261 mrf = base_mrf + 1;
3262 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3263 emit_urb_slot(dst_reg(MRF, mrf++),
3264 prog_data->vue_map.slot_to_varying[slot]);
3265
3266 /* If this was max_usable_mrf, we can't fit anything more into this
3267 * URB WRITE.
3268 */
3269 if (mrf > max_usable_mrf) {
3270 slot++;
3271 break;
3272 }
3273 }
3274
3275 complete = slot >= prog_data->vue_map.num_slots;
3276 current_annotation = "URB write";
3277 vec4_instruction *inst = emit_urb_write_opcode(complete);
3278 inst->base_mrf = base_mrf;
3279 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3280 inst->offset += offset;
3281 } while(!complete);
3282 }
3283
3284
3285 src_reg
3286 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3287 src_reg *reladdr, int reg_offset)
3288 {
3289 /* Because we store the values to scratch interleaved like our
3290 * vertex data, we need to scale the vec4 index by 2.
3291 */
3292 int message_header_scale = 2;
3293
3294 /* Pre-gen6, the message header uses byte offsets instead of vec4
3295 * (16-byte) offset units.
3296 */
3297 if (devinfo->gen < 6)
3298 message_header_scale *= 16;
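/* For example, reg_offset == 3 becomes 6 (interleaved vec4 slots) on
 * Gen6+, or 3 * 32 == 96 bytes on older generations.
 */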
3299
3300 if (reladdr) {
3301 src_reg index = src_reg(this, glsl_type::int_type);
3302
3303 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3304 src_reg(reg_offset)));
3305 emit_before(block, inst, MUL(dst_reg(index), index,
3306 src_reg(message_header_scale)));
3307
3308 return index;
3309 } else {
3310 return src_reg(reg_offset * message_header_scale);
3311 }
3312 }
3313
3314 src_reg
3315 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3316 src_reg *reladdr, int reg_offset)
3317 {
3318 if (reladdr) {
3319 src_reg index = src_reg(this, glsl_type::int_type);
3320
3321 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3322 src_reg(reg_offset)));
3323
3324 /* Pre-gen6, the message header uses byte offsets instead of vec4
3325 * (16-byte) offset units.
3326 */
3327 if (devinfo->gen < 6) {
3328 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3329 }
3330
3331 return index;
3332 } else if (devinfo->gen >= 8) {
3333 /* Store the offset in a GRF so we can send-from-GRF. */
3334 src_reg offset = src_reg(this, glsl_type::int_type);
3335 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3336 return offset;
3337 } else {
3338 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3339 return src_reg(reg_offset * message_header_scale);
3340 }
3341 }
3342
3343 /**
3344 * Emits an instruction before @inst to load the value named by @orig_src
3345 * from scratch space at @base_offset to @temp.
3346 *
3347 * @base_offset is measured in 32-byte units (the size of a register).
3348 */
3349 void
3350 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3351 dst_reg temp, src_reg orig_src,
3352 int base_offset)
3353 {
3354 int reg_offset = base_offset + orig_src.reg_offset;
3355 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3356 reg_offset);
3357
3358 emit_before(block, inst, SCRATCH_READ(temp, index));
3359 }
3360
3361 /**
3362 * Emits an instruction after @inst to store the value to be written
3363 * to @orig_dst to scratch space at @base_offset, from @temp.
3364 *
3365 * @base_offset is measured in 32-byte units (the size of a register).
3366 */
3367 void
3368 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3369 int base_offset)
3370 {
3371 int reg_offset = base_offset + inst->dst.reg_offset;
3372 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3373 reg_offset);
3374
3375 /* Create a temporary register to store *inst's result in.
3376 *
3377 * We have to be careful in MOVing from our temporary result register in
3378 * the scratch write. If we swizzle from channels of the temporary that
3379 * weren't initialized, it will confuse live interval analysis, which will
3380 * make spilling fail to make progress.
3381 */
3382 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3383 inst->dst.type),
3384 brw_swizzle_for_mask(inst->dst.writemask));
3385 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3386 inst->dst.writemask));
3387 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3388 write->predicate = inst->predicate;
3389 write->ir = inst->ir;
3390 write->annotation = inst->annotation;
3391 inst->insert_after(block, write);
3392
3393 inst->dst.file = temp.file;
3394 inst->dst.reg = temp.reg;
3395 inst->dst.reg_offset = temp.reg_offset;
3396 inst->dst.reladdr = NULL;
3397 }
3398
3399 /**
3400 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3401 * adds the scratch read(s) before \p inst. The function also checks for
3402 * recursive reladdr scratch accesses, issuing the corresponding scratch
3403 * loads and rewriting reladdr references accordingly.
3404 *
3405 * \return \p src if it did not require a scratch load, otherwise, the
3406 * register holding the result of the scratch load that the caller should
3407 * use to rewrite src.
3408 */
3409 src_reg
3410 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3411 vec4_instruction *inst, src_reg src)
3412 {
3413 /* Resolve recursive reladdr scratch access by calling ourselves
3414 * with src.reladdr
3415 */
3416 if (src.reladdr)
3417 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3418 *src.reladdr);
3419
3420 /* Now handle scratch access on src */
3421 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3422 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3423 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3424 src.reg = temp.reg;
3425 src.reg_offset = temp.reg_offset;
3426 src.reladdr = NULL;
3427 }
3428
3429 return src;
3430 }
3431
3432 /**
3433 * We can't generally support array access in GRF space, because a
3434 * single instruction's destination can only span 2 contiguous
3435 * registers. So, we send all GRF arrays that get variable index
3436 * access to scratch space.
3437 */
3438 void
3439 vec4_visitor::move_grf_array_access_to_scratch()
3440 {
3441 int scratch_loc[this->alloc.count];
3442 memset(scratch_loc, -1, sizeof(scratch_loc));
3443
3444 /* First, calculate the set of virtual GRFs that need to be punted
3445 * to scratch due to having any array access on them, and where in
3446 * scratch.
3447 */
3448 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3449 if (inst->dst.file == GRF && inst->dst.reladdr) {
3450 if (scratch_loc[inst->dst.reg] == -1) {
3451 scratch_loc[inst->dst.reg] = c->last_scratch;
3452 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3453 }
3454
3455 for (src_reg *iter = inst->dst.reladdr;
3456 iter->reladdr;
3457 iter = iter->reladdr) {
3458 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3459 scratch_loc[iter->reg] = c->last_scratch;
3460 c->last_scratch += this->alloc.sizes[iter->reg];
3461 }
3462 }
3463 }
3464
3465 for (int i = 0 ; i < 3; i++) {
3466 for (src_reg *iter = &inst->src[i];
3467 iter->reladdr;
3468 iter = iter->reladdr) {
3469 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3470 scratch_loc[iter->reg] = c->last_scratch;
3471 c->last_scratch += this->alloc.sizes[iter->reg];
3472 }
3473 }
3474 }
3475 }
3476
3477 /* Now, for anything that will be accessed through scratch, rewrite
3478 * it to load/store. Note that this is a _safe list walk, because
3479 * we may generate a new scratch_write instruction after the one
3480 * we're processing.
3481 */
3482 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3483 /* Set up the annotation tracking for new generated instructions. */
3484 base_ir = inst->ir;
3485 current_annotation = inst->annotation;
3486
3487 /* First handle scratch access on the dst. Notice we have to handle
3488 * the case where the dst's reladdr also points to scratch space.
3489 */
3490 if (inst->dst.reladdr)
3491 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3492 *inst->dst.reladdr);
3493
3494 /* Now that we have handled any (possibly recursive) reladdr scratch
3495 * accesses for dst we can safely do the scratch write for dst itself
3496 */
3497 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3498 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3499
3500 /* Now handle scratch access on any src. In this case, since inst->src[i]
3501 * already is a src_reg, we can just call emit_resolve_reladdr with
3502 * inst->src[i] and it will take care of handling scratch loads for
3503 * both src and src.reladdr (recursively).
3504 */
3505 for (int i = 0 ; i < 3; i++) {
3506 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3507 inst->src[i]);
3508 }
3509 }
3510 }
3511
3512 /**
3513 * Emits an instruction before @inst to load the value named by @orig_src
3514 * from the pull constant buffer (surface) at @base_offset to @temp.
3515 */
3516 void
3517 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3518 dst_reg temp, src_reg orig_src,
3519 int base_offset)
3520 {
3521 int reg_offset = base_offset + orig_src.reg_offset;
3522 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3523 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3524 reg_offset);
3525
3526 emit_pull_constant_load_reg(temp,
3527 index,
3528 offset,
3529 block, inst);
3530 }
3531
3532 /**
3533 * Implements array access of uniforms by inserting a
3534 * PULL_CONSTANT_LOAD instruction.
3535 *
3536 * Unlike temporary GRF array access (where we don't support it due to
3537 * the difficulty of doing relative addressing on instruction
3538 * destinations), we could potentially do array access of uniforms
3539 * that were loaded in GRF space as push constants. In real-world
3540 * usage we've seen, though, the arrays being used are always larger
3541 * than we could load as push constants, so just always move all
3542 * uniform array access out to a pull constant buffer.
3543 */
3544 void
3545 vec4_visitor::move_uniform_array_access_to_pull_constants()
3546 {
3547 int pull_constant_loc[this->uniforms];
3548 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3549 bool nested_reladdr;
3550
3551 /* Walk through and find array access of uniforms. Put a copy of that
3552 * uniform in the pull constant buffer.
3553 *
3554 * Note that we don't move constant-indexed accesses to arrays. No
3555 * testing has been done of the performance impact of this choice.
3556 */
3557 do {
3558 nested_reladdr = false;
3559
3560 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3561 for (int i = 0 ; i < 3; i++) {
3562 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3563 continue;
3564
3565 int uniform = inst->src[i].reg;
3566
3567 if (inst->src[i].reladdr->reladdr)
3568 nested_reladdr = true; /* will need another pass */
3569
3570 /* If this array isn't already present in the pull constant buffer,
3571 * add it.
3572 */
3573 if (pull_constant_loc[uniform] == -1) {
3574 const gl_constant_value **values =
3575 &stage_prog_data->param[uniform * 4];
3576
3577 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3578
3579 assert(uniform < uniform_array_size);
3580 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3581 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3582 = values[j];
3583 }
3584 }
3585
3586 /* Set up the annotation tracking for new generated instructions. */
3587 base_ir = inst->ir;
3588 current_annotation = inst->annotation;
3589
3590 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3591
3592 emit_pull_constant_load(block, inst, temp, inst->src[i],
3593 pull_constant_loc[uniform]);
3594
3595 inst->src[i].file = temp.file;
3596 inst->src[i].reg = temp.reg;
3597 inst->src[i].reg_offset = temp.reg_offset;
3598 inst->src[i].reladdr = NULL;
3599 }
3600 }
3601 } while (nested_reladdr);
3602
3603 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3604 * no need to track them as larger-than-vec4 objects. This will be
3605 * relied on in cutting out unused uniform vectors from push
3606 * constants.
3607 */
3608 split_uniform_registers();
3609 }
3610
3611 void
3612 vec4_visitor::resolve_ud_negate(src_reg *reg)
3613 {
3614 if (reg->type != BRW_REGISTER_TYPE_UD ||
3615 !reg->negate)
3616 return;
3617
3618 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3619 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3620 *reg = temp;
3621 }
3622
3623 /**
3624 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3625 *
3626 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3627 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3628 */
3629 void
3630 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3631 {
3632 assert(devinfo->gen <= 5);
3633
3634 if (!rvalue->type->is_boolean())
3635 return;
3636
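/* ANDing with 1 keeps only the defined LSB; negating that 0/1 value then
 * yields the canonical 0 or ~0 (e.g. -(1) == 0xffffffff).
 */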
3637 src_reg and_result = src_reg(this, rvalue->type);
3638 src_reg neg_result = src_reg(this, rvalue->type);
3639 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3640 emit(MOV(dst_reg(neg_result), negate(and_result)));
3641 *reg = neg_result;
3642 }
3643
3644 vec4_visitor::vec4_visitor(struct brw_context *brw,
3645 struct brw_vec4_compile *c,
3646 struct gl_program *prog,
3647 const struct brw_vue_prog_key *key,
3648 struct brw_vue_prog_data *prog_data,
3649 struct gl_shader_program *shader_prog,
3650 gl_shader_stage stage,
3651 void *mem_ctx,
3652 bool no_spills,
3653 shader_time_shader_type st_base,
3654 shader_time_shader_type st_written,
3655 shader_time_shader_type st_reset)
3656 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3657 c(c),
3658 key(key),
3659 prog_data(prog_data),
3660 sanity_param_count(0),
3661 fail_msg(NULL),
3662 first_non_payload_grf(0),
3663 need_all_constants_in_pull_buffer(false),
3664 no_spills(no_spills),
3665 st_base(st_base),
3666 st_written(st_written),
3667 st_reset(st_reset)
3668 {
3669 this->mem_ctx = mem_ctx;
3670 this->failed = false;
3671
3672 this->base_ir = NULL;
3673 this->current_annotation = NULL;
3674 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3675
3676 this->variable_ht = hash_table_ctor(0,
3677 hash_table_pointer_hash,
3678 hash_table_pointer_compare);
3679
3680 this->virtual_grf_start = NULL;
3681 this->virtual_grf_end = NULL;
3682 this->live_intervals = NULL;
3683
3684 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3685
3686 this->uniforms = 0;
3687
3688 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3689 * at least one. See setup_uniforms() in brw_vec4.cpp.
3690 */
3691 this->uniform_array_size = 1;
3692 if (prog_data) {
3693 this->uniform_array_size =
3694 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3695 }
3696
3697 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3698 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3699 }
3700
3701 vec4_visitor::~vec4_visitor()
3702 {
3703 hash_table_dtor(this->variable_ht);
3704 }
3705
3706
3707 void
3708 vec4_visitor::fail(const char *format, ...)
3709 {
3710 va_list va;
3711 char *msg;
3712
3713 if (failed)
3714 return;
3715
3716 failed = true;
3717
3718 va_start(va, format);
3719 msg = ralloc_vasprintf(mem_ctx, format, va);
3720 va_end(va);
3721 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3722
3723 this->fail_msg = msg;
3724
3725 if (debug_enabled) {
3726 fprintf(stderr, "%s", msg);
3727 }
3728 }
3729
3730 } /* namespace brw */