i965/vec4: Plumb log_data through so the backend_shader field gets set.
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
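/* Each ALUn(op) invocation below expands to a small vec4_visitor helper,
 * e.g. ALU2(ADD) defines vec4_visitor::ADD(dst, src0, src1), which simply
 * allocates a new vec4_instruction for BRW_OPCODE_ADD out of mem_ctx and
 * returns it without emitting it; callers hand the result to emit().
 * ALU2_ACC additionally marks the instruction as writing the accumulator,
 * and ALU3 asserts gen >= 6 because 3-source instructions are gen6+ only.
 */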
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
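   /* Worked example (illustrative, not from the PRM): packing vec2(1.0, -2.0).
    * The F32TO16 below writes 0x3C00 (1.0h) to tmp.x and 0xC000 (-2.0h) to
    * tmp.y, with the upper words cleared as described above.  Shifting
    * tmp.yyyy left by 16 gives 0xC0000000 in dst, and the final OR with
    * tmp.xxxx yields 0xC0003C00, which is packHalf2x16(vec2(1.0, -2.0)).
    */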
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
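   /* Illustrative example: unpacking 0xC0003C00 (the value packed in the
    * emit_pack_half_2x16() example above).  The AND below leaves 0x3C00 in
    * tmp.x, the SHR leaves 0xC000 in tmp.y, and F16TO32 then produces
    * (1.0, -2.0) in dst.xy.
    */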
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
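   /* (In the VF restricted-float encoding used by the immediate below, the
    * bytes 0x00, 0x60, 0x70 and 0x78 stand for 0.0, 8.0, 16.0 and 24.0, so
    * the type-converting MOV leaves the shift counts <0, 8, 16, 24> in
    * `shift`.)
    */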
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = brw_swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (storage->builtin)
690 continue;
691
692 if (strncmp(ir->name, storage->name, namelen) != 0 ||
693 (storage->name[namelen] != 0 &&
694 storage->name[namelen] != '.' &&
695 storage->name[namelen] != '[')) {
696 continue;
697 }
698
699 gl_constant_value *components = storage->storage;
700 unsigned vector_count = (MAX2(storage->array_elements, 1) *
701 storage->type->matrix_columns);
702
703 for (unsigned s = 0; s < vector_count; s++) {
704 assert(uniforms < uniform_array_size);
705 uniform_vector_size[uniforms] = storage->type->vector_elements;
706
707 int i;
708 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
709 stage_prog_data->param[uniforms * 4 + i] = components;
710 components++;
711 }
712 for (; i < 4; i++) {
713 static gl_constant_value zero = { 0.0 };
714 stage_prog_data->param[uniforms * 4 + i] = &zero;
715 }
716
717 uniforms++;
718 }
719 }
720 }
721
722 void
723 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
724 {
725 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
726 assert(this->uniforms < uniform_array_size);
727 this->uniform_vector_size[this->uniforms] = 4;
728 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
729 this->userplane[i].type = BRW_REGISTER_TYPE_F;
730 for (int j = 0; j < 4; ++j) {
731 stage_prog_data->param[this->uniforms * 4 + j] =
732 (gl_constant_value *) &clip_planes[i][j];
733 }
734 ++this->uniforms;
735 }
736 }
737
738 /* Our support for builtin uniforms is even scarier than non-builtin.
739 * It sits on top of the PROG_STATE_VAR parameters that are
740 * automatically updated from GL context state.
741 */
742 void
743 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
744 {
745 const ir_state_slot *const slots = ir->get_state_slots();
746 assert(slots != NULL);
747
748 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
749 /* This state reference has already been setup by ir_to_mesa,
750 * but we'll get the same index back here. We can reference
751 * ParameterValues directly, since unlike brw_fs.cpp, we never
752 * add new state references during compile.
753 */
754 int index = _mesa_add_state_reference(this->prog->Parameters,
755 (gl_state_index *)slots[i].tokens);
756 gl_constant_value *values =
757 &this->prog->Parameters->ParameterValues[index][0];
758
759 assert(this->uniforms < uniform_array_size);
760
761 for (unsigned j = 0; j < 4; j++)
762 stage_prog_data->param[this->uniforms * 4 + j] =
763 &values[GET_SWZ(slots[i].swizzle, j)];
764
765 this->uniform_vector_size[this->uniforms] =
766 (ir->type->is_scalar() || ir->type->is_vector() ||
767 ir->type->is_matrix() ? ir->type->vector_elements : 4);
768
769 this->uniforms++;
770 }
771 }
772
773 dst_reg *
774 vec4_visitor::variable_storage(ir_variable *var)
775 {
776 return (dst_reg *)hash_table_find(this->variable_ht, var);
777 }
778
779 void
780 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
781 enum brw_predicate *predicate)
782 {
783 ir_expression *expr = ir->as_expression();
784
785 *predicate = BRW_PREDICATE_NORMAL;
786
787 if (expr && expr->operation != ir_binop_ubo_load) {
788 src_reg op[3];
789 vec4_instruction *inst;
790
791 assert(expr->get_num_operands() <= 3);
792 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
793 expr->operands[i]->accept(this);
794 op[i] = this->result;
795
796 resolve_ud_negate(&op[i]);
797 }
798
799 switch (expr->operation) {
800 case ir_unop_logic_not:
801 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
802 inst->conditional_mod = BRW_CONDITIONAL_Z;
803 break;
804
805 case ir_binop_logic_xor:
806 if (devinfo->gen <= 5) {
807 src_reg temp = src_reg(this, ir->type);
808 emit(XOR(dst_reg(temp), op[0], op[1]));
809 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
810 } else {
811 inst = emit(XOR(dst_null_d(), op[0], op[1]));
812 }
813 inst->conditional_mod = BRW_CONDITIONAL_NZ;
814 break;
815
816 case ir_binop_logic_or:
817 if (devinfo->gen <= 5) {
818 src_reg temp = src_reg(this, ir->type);
819 emit(OR(dst_reg(temp), op[0], op[1]));
820 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
821 } else {
822 inst = emit(OR(dst_null_d(), op[0], op[1]));
823 }
824 inst->conditional_mod = BRW_CONDITIONAL_NZ;
825 break;
826
827 case ir_binop_logic_and:
828 if (devinfo->gen <= 5) {
829 src_reg temp = src_reg(this, ir->type);
830 emit(AND(dst_reg(temp), op[0], op[1]));
831 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
832 } else {
833 inst = emit(AND(dst_null_d(), op[0], op[1]));
834 }
835 inst->conditional_mod = BRW_CONDITIONAL_NZ;
836 break;
837
838 case ir_unop_f2b:
839 if (devinfo->gen >= 6) {
840 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
841 } else {
842 inst = emit(MOV(dst_null_f(), op[0]));
843 inst->conditional_mod = BRW_CONDITIONAL_NZ;
844 }
845 break;
846
847 case ir_unop_i2b:
848 if (devinfo->gen >= 6) {
849 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
850 } else {
851 inst = emit(MOV(dst_null_d(), op[0]));
852 inst->conditional_mod = BRW_CONDITIONAL_NZ;
853 }
854 break;
855
856 case ir_binop_all_equal:
857 if (devinfo->gen <= 5) {
858 resolve_bool_comparison(expr->operands[0], &op[0]);
859 resolve_bool_comparison(expr->operands[1], &op[1]);
860 }
861 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
862 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
863 break;
864
865 case ir_binop_any_nequal:
866 if (devinfo->gen <= 5) {
867 resolve_bool_comparison(expr->operands[0], &op[0]);
868 resolve_bool_comparison(expr->operands[1], &op[1]);
869 }
870 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
871 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
872 break;
873
874 case ir_unop_any:
875 if (devinfo->gen <= 5) {
876 resolve_bool_comparison(expr->operands[0], &op[0]);
877 }
878 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
879 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
880 break;
881
882 case ir_binop_greater:
883 case ir_binop_gequal:
884 case ir_binop_less:
885 case ir_binop_lequal:
886 case ir_binop_equal:
887 case ir_binop_nequal:
888 if (devinfo->gen <= 5) {
889 resolve_bool_comparison(expr->operands[0], &op[0]);
890 resolve_bool_comparison(expr->operands[1], &op[1]);
891 }
892 emit(CMP(dst_null_d(), op[0], op[1],
893 brw_conditional_for_comparison(expr->operation)));
894 break;
895
896 case ir_triop_csel: {
897 /* Expand the boolean condition into the flag register. */
898 inst = emit(MOV(dst_null_d(), op[0]));
899 inst->conditional_mod = BRW_CONDITIONAL_NZ;
900
901 /* Select which boolean to return. */
902 dst_reg temp(this, expr->operands[1]->type);
903 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
904 inst->predicate = BRW_PREDICATE_NORMAL;
905
906 /* Expand the result to a condition code. */
907 inst = emit(MOV(dst_null_d(), src_reg(temp)));
908 inst->conditional_mod = BRW_CONDITIONAL_NZ;
909 break;
910 }
911
912 default:
913 unreachable("not reached");
914 }
915 return;
916 }
917
918 ir->accept(this);
919
920 resolve_ud_negate(&this->result);
921
922 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
923 inst->conditional_mod = BRW_CONDITIONAL_NZ;
924 }
925
926 /**
927 * Emit a gen6 IF statement with the comparison folded into the IF
928 * instruction.
929 */
930 void
931 vec4_visitor::emit_if_gen6(ir_if *ir)
932 {
933 ir_expression *expr = ir->condition->as_expression();
934
935 if (expr && expr->operation != ir_binop_ubo_load) {
936 src_reg op[3];
937 dst_reg temp;
938
939 assert(expr->get_num_operands() <= 3);
940 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
941 expr->operands[i]->accept(this);
942 op[i] = this->result;
943 }
944
945 switch (expr->operation) {
946 case ir_unop_logic_not:
947 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
948 return;
949
950 case ir_binop_logic_xor:
951 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
952 return;
953
954 case ir_binop_logic_or:
955 temp = dst_reg(this, glsl_type::bool_type);
956 emit(OR(temp, op[0], op[1]));
957 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
958 return;
959
960 case ir_binop_logic_and:
961 temp = dst_reg(this, glsl_type::bool_type);
962 emit(AND(temp, op[0], op[1]));
963 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
964 return;
965
966 case ir_unop_f2b:
967 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
968 return;
969
970 case ir_unop_i2b:
971 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
972 return;
973
974 case ir_binop_greater:
975 case ir_binop_gequal:
976 case ir_binop_less:
977 case ir_binop_lequal:
978 case ir_binop_equal:
979 case ir_binop_nequal:
980 emit(IF(op[0], op[1],
981 brw_conditional_for_comparison(expr->operation)));
982 return;
983
984 case ir_binop_all_equal:
985 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
986 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
987 return;
988
989 case ir_binop_any_nequal:
990 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
991 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
992 return;
993
994 case ir_unop_any:
995 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
996 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
997 return;
998
999 case ir_triop_csel: {
1000 /* Expand the boolean condition into the flag register. */
1001 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1002 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1003
1004 /* Select which boolean to return. */
1005 dst_reg temp(this, expr->operands[1]->type);
1006 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1007 inst->predicate = BRW_PREDICATE_NORMAL;
1008
1009 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1010 return;
1011 }
1012
1013 default:
1014 unreachable("not reached");
1015 }
1016 return;
1017 }
1018
1019 ir->condition->accept(this);
1020
1021 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1022 }
1023
1024 void
1025 vec4_visitor::visit(ir_variable *ir)
1026 {
1027 dst_reg *reg = NULL;
1028
1029 if (variable_storage(ir))
1030 return;
1031
1032 switch (ir->data.mode) {
1033 case ir_var_shader_in:
1034 assert(ir->data.location != -1);
1035 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1036 break;
1037
1038 case ir_var_shader_out:
1039 assert(ir->data.location != -1);
1040 reg = new(mem_ctx) dst_reg(this, ir->type);
1041
1042 for (int i = 0; i < type_size(ir->type); i++) {
1043 output_reg[ir->data.location + i] = *reg;
1044 output_reg[ir->data.location + i].reg_offset = i;
1045 output_reg[ir->data.location + i].type =
1046 brw_type_for_base_type(ir->type->get_scalar_type());
1047 output_reg_annotation[ir->data.location + i] = ir->name;
1048 }
1049 break;
1050
1051 case ir_var_auto:
1052 case ir_var_temporary:
1053 reg = new(mem_ctx) dst_reg(this, ir->type);
1054 break;
1055
1056 case ir_var_uniform:
1057 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1058
1059 /* Thanks to the lower_ubo_reference pass, we will see only
1060 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1061 * variables, so no need for them to be in variable_ht.
1062 *
1063 * Some uniforms, such as samplers and atomic counters, have no actual
1064 * storage, so we should ignore them.
1065 */
1066 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1067 return;
1068
1069 /* Track how big the whole uniform variable is, in case we need to put a
1070 * copy of its data into pull constants for array access.
1071 */
1072 assert(this->uniforms < uniform_array_size);
1073 this->uniform_size[this->uniforms] = type_size(ir->type);
1074
1075 if (!strncmp(ir->name, "gl_", 3)) {
1076 setup_builtin_uniform_values(ir);
1077 } else {
1078 setup_uniform_values(ir);
1079 }
1080 break;
1081
1082 case ir_var_system_value:
1083 reg = make_reg_for_system_value(ir);
1084 break;
1085
1086 default:
1087 unreachable("not reached");
1088 }
1089
1090 reg->type = brw_type_for_base_type(ir->type);
1091 hash_table_insert(this->variable_ht, reg, ir);
1092 }
1093
1094 void
1095 vec4_visitor::visit(ir_loop *ir)
1096 {
1097 /* We don't want debugging output to print the whole body of the
1098 * loop as the annotation.
1099 */
1100 this->base_ir = NULL;
1101
1102 emit(BRW_OPCODE_DO);
1103
1104 visit_instructions(&ir->body_instructions);
1105
1106 emit(BRW_OPCODE_WHILE);
1107 }
1108
1109 void
1110 vec4_visitor::visit(ir_loop_jump *ir)
1111 {
1112 switch (ir->mode) {
1113 case ir_loop_jump::jump_break:
1114 emit(BRW_OPCODE_BREAK);
1115 break;
1116 case ir_loop_jump::jump_continue:
1117 emit(BRW_OPCODE_CONTINUE);
1118 break;
1119 }
1120 }
1121
1122
1123 void
1124 vec4_visitor::visit(ir_function_signature *)
1125 {
1126 unreachable("not reached");
1127 }
1128
1129 void
1130 vec4_visitor::visit(ir_function *ir)
1131 {
1132 /* Ignore function bodies other than main() -- we shouldn't see calls to
1133 * them since they should all be inlined.
1134 */
1135 if (strcmp(ir->name, "main") == 0) {
1136 const ir_function_signature *sig;
1137 exec_list empty;
1138
1139 sig = ir->matching_signature(NULL, &empty, false);
1140
1141 assert(sig);
1142
1143 visit_instructions(&sig->body);
1144 }
1145 }
1146
1147 bool
1148 vec4_visitor::try_emit_mad(ir_expression *ir)
1149 {
1150 /* 3-src instructions were introduced in gen6. */
1151 if (devinfo->gen < 6)
1152 return false;
1153
1154 /* MAD can only handle floating-point data. */
1155 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1156 return false;
1157
1158 ir_rvalue *nonmul;
1159 ir_expression *mul;
1160 bool mul_negate, mul_abs;
1161
1162 for (int i = 0; i < 2; i++) {
1163 mul_negate = false;
1164 mul_abs = false;
1165
1166 mul = ir->operands[i]->as_expression();
1167 nonmul = ir->operands[1 - i];
1168
1169 if (mul && mul->operation == ir_unop_abs) {
1170 mul = mul->operands[0]->as_expression();
1171 mul_abs = true;
1172 } else if (mul && mul->operation == ir_unop_neg) {
1173 mul = mul->operands[0]->as_expression();
1174 mul_negate = true;
1175 }
1176
1177 if (mul && mul->operation == ir_binop_mul)
1178 break;
1179 }
1180
1181 if (!mul || mul->operation != ir_binop_mul)
1182 return false;
1183
1184 nonmul->accept(this);
1185 src_reg src0 = fix_3src_operand(this->result);
1186
1187 mul->operands[0]->accept(this);
1188 src_reg src1 = fix_3src_operand(this->result);
1189 src1.negate ^= mul_negate;
1190 src1.abs = mul_abs;
1191 if (mul_abs)
1192 src1.negate = false;
1193
1194 mul->operands[1]->accept(this);
1195 src_reg src2 = fix_3src_operand(this->result);
1196 src2.abs = mul_abs;
1197 if (mul_abs)
1198 src2.negate = false;
1199
1200 this->result = src_reg(this, ir->type);
1201 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1202
1203 return true;
1204 }
1205
1206 bool
1207 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1208 {
1209 /* This optimization relies on CMP setting the destination to 0 when
1210 * false. Early hardware only sets the least significant bit, and
1211 * leaves the other bits undefined. So we can't use it.
1212 */
1213 if (devinfo->gen < 6)
1214 return false;
1215
1216 ir_expression *const cmp = ir->operands[0]->as_expression();
1217
1218 if (cmp == NULL)
1219 return false;
1220
1221 switch (cmp->operation) {
1222 case ir_binop_less:
1223 case ir_binop_greater:
1224 case ir_binop_lequal:
1225 case ir_binop_gequal:
1226 case ir_binop_equal:
1227 case ir_binop_nequal:
1228 break;
1229
1230 default:
1231 return false;
1232 }
1233
1234 cmp->operands[0]->accept(this);
1235 const src_reg cmp_src0 = this->result;
1236
1237 cmp->operands[1]->accept(this);
1238 const src_reg cmp_src1 = this->result;
1239
1240 this->result = src_reg(this, ir->type);
1241
1242 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1243 brw_conditional_for_comparison(cmp->operation)));
1244
1245 /* If the comparison is false, this->result will just happen to be zero.
1246 */
1247 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1248 this->result, src_reg(1.0f));
1249 inst->predicate = BRW_PREDICATE_NORMAL;
1250 inst->predicate_inverse = true;
1251
1252 return true;
1253 }
1254
1255 void
1256 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1257 src_reg src0, src_reg src1)
1258 {
1259 vec4_instruction *inst;
1260
1261 if (devinfo->gen >= 6) {
1262 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1263 inst->conditional_mod = conditionalmod;
1264 } else {
1265 emit(CMP(dst, src0, src1, conditionalmod));
1266
1267 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1268 inst->predicate = BRW_PREDICATE_NORMAL;
1269 }
1270 }
1271
1272 void
1273 vec4_visitor::emit_lrp(const dst_reg &dst,
1274 const src_reg &x, const src_reg &y, const src_reg &a)
1275 {
1276 if (devinfo->gen >= 6) {
1277 /* Note that the instruction's argument order is reversed from GLSL
1278 * and the IR.
1279 */
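      /* (LRP dst, a, y, x computes a * y + (1 - a) * x, i.e. mix(x, y, a).) */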
1280 emit(LRP(dst,
1281 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1282 } else {
1283 /* Earlier generations don't support three source operations, so we
1284 * need to emit x*(1-a) + y*a.
1285 */
1286 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1287 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1288 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1289 y_times_a.writemask = dst.writemask;
1290 one_minus_a.writemask = dst.writemask;
1291 x_times_one_minus_a.writemask = dst.writemask;
1292
1293 emit(MUL(y_times_a, y, a));
1294 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1295 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1296 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1297 }
1298 }
1299
1300 /**
1301 * Emits the instructions needed to perform a pull constant load. before_block
1302 * and before_inst can be NULL in which case the instruction will be appended
1303 * to the end of the instruction list.
1304 */
1305 void
1306 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1307 src_reg surf_index,
1308 src_reg offset_reg,
1309 bblock_t *before_block,
1310 vec4_instruction *before_inst)
1311 {
1312 assert((before_inst == NULL && before_block == NULL) ||
1313 (before_inst && before_block));
1314
1315 vec4_instruction *pull;
1316
1317 if (devinfo->gen >= 9) {
1318 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1319 src_reg header(this, glsl_type::uvec4_type, 2);
1320
1321 pull = new(mem_ctx)
1322 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1323 dst_reg(header));
1324
1325 if (before_inst)
1326 emit_before(before_block, before_inst, pull);
1327 else
1328 emit(pull);
1329
1330 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1331 offset_reg.type);
1332 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1333
1334 if (before_inst)
1335 emit_before(before_block, before_inst, pull);
1336 else
1337 emit(pull);
1338
1339 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1340 dst,
1341 surf_index,
1342 header);
1343 pull->mlen = 2;
1344 pull->header_size = 1;
1345 } else if (devinfo->gen >= 7) {
1346 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1347
1348 grf_offset.type = offset_reg.type;
1349
1350 pull = MOV(grf_offset, offset_reg);
1351
1352 if (before_inst)
1353 emit_before(before_block, before_inst, pull);
1354 else
1355 emit(pull);
1356
1357 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1358 dst,
1359 surf_index,
1360 src_reg(grf_offset));
1361 pull->mlen = 1;
1362 } else {
1363 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1364 dst,
1365 surf_index,
1366 offset_reg);
1367 pull->base_mrf = 14;
1368 pull->mlen = 1;
1369 }
1370
1371 if (before_inst)
1372 emit_before(before_block, before_inst, pull);
1373 else
1374 emit(pull);
1375 }
1376
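/* Copy the value of src from a single live channel to all channels of dst:
 * FIND_LIVE_CHANNEL picks the index of an enabled channel, and BROADCAST
 * then copies src's value from that channel to every channel of dst.
 */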
1377 void
1378 vec4_visitor::emit_uniformize(const dst_reg &dst, const src_reg &src)
1379 {
1380 const src_reg chan_index(this, glsl_type::uint_type);
1381
1382 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1383 ->force_writemask_all = true;
1384 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1385 ->force_writemask_all = true;
1386 }
1387
1388 void
1389 vec4_visitor::visit(ir_expression *ir)
1390 {
1391 unsigned int operand;
1392 src_reg op[ARRAY_SIZE(ir->operands)];
1393 vec4_instruction *inst;
1394
1395 if (ir->operation == ir_binop_add) {
1396 if (try_emit_mad(ir))
1397 return;
1398 }
1399
1400 if (ir->operation == ir_unop_b2f) {
1401 if (try_emit_b2f_of_compare(ir))
1402 return;
1403 }
1404
1405 /* Storage for our result. Ideally for an assignment we'd be using
1406 * the actual storage for the result here, instead.
1407 */
1408 dst_reg result_dst(this, ir->type);
1409 src_reg result_src(result_dst);
1410
1411 if (ir->operation == ir_triop_csel) {
1412 ir->operands[1]->accept(this);
1413 op[1] = this->result;
1414 ir->operands[2]->accept(this);
1415 op[2] = this->result;
1416
1417 enum brw_predicate predicate;
1418 emit_bool_to_cond_code(ir->operands[0], &predicate);
1419 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1420 inst->predicate = predicate;
1421 this->result = result_src;
1422 return;
1423 }
1424
1425 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1426 this->result.file = BAD_FILE;
1427 ir->operands[operand]->accept(this);
1428 if (this->result.file == BAD_FILE) {
1429 fprintf(stderr, "Failed to get tree for expression operand:\n");
1430 ir->operands[operand]->fprint(stderr);
1431 exit(1);
1432 }
1433 op[operand] = this->result;
1434
1435 /* Matrix expression operands should have been broken down to vector
1436 * operations already.
1437 */
1438 assert(!ir->operands[operand]->type->is_matrix());
1439 }
1440
1441 /* If nothing special happens, this is the result. */
1442 this->result = result_src;
1443
1444 switch (ir->operation) {
1445 case ir_unop_logic_not:
1446 emit(NOT(result_dst, op[0]));
1447 break;
1448 case ir_unop_neg:
1449 op[0].negate = !op[0].negate;
1450 emit(MOV(result_dst, op[0]));
1451 break;
1452 case ir_unop_abs:
1453 op[0].abs = true;
1454 op[0].negate = false;
1455 emit(MOV(result_dst, op[0]));
1456 break;
1457
1458 case ir_unop_sign:
1459 if (ir->type->is_float()) {
1460 /* AND(val, 0x80000000) gives the sign bit.
1461 *
1462 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1463 * zero.
1464 */
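         /* Illustrative example: for op[0] = -2.5f (0xC0200000), the AND keeps
          * only the sign bit, 0x80000000.  The CMP below flags the value as
          * nonzero, so the predicated OR produces 0x80000000 | 0x3f800000 =
          * 0xBF800000, i.e. -1.0f, which is sign(-2.5).
          */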
1465 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1466
1467 op[0].type = BRW_REGISTER_TYPE_UD;
1468 result_dst.type = BRW_REGISTER_TYPE_UD;
1469 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1470
1471 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1472 inst->predicate = BRW_PREDICATE_NORMAL;
1473
1474 this->result.type = BRW_REGISTER_TYPE_F;
1475 } else {
1476 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1477 * -> non-negative val generates 0x00000000.
1478 * Predicated OR sets 1 if val is positive.
1479 */
1480 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1481
1482 emit(ASR(result_dst, op[0], src_reg(31)));
1483
1484 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1485 inst->predicate = BRW_PREDICATE_NORMAL;
1486 }
1487 break;
1488
1489 case ir_unop_rcp:
1490 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1491 break;
1492
1493 case ir_unop_exp2:
1494 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1495 break;
1496 case ir_unop_log2:
1497 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1498 break;
1499 case ir_unop_exp:
1500 case ir_unop_log:
1501 unreachable("not reached: should be handled by ir_explog_to_explog2");
1502 case ir_unop_sin:
1503 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1504 break;
1505 case ir_unop_cos:
1506 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1507 break;
1508
1509 case ir_unop_dFdx:
1510 case ir_unop_dFdx_coarse:
1511 case ir_unop_dFdx_fine:
1512 case ir_unop_dFdy:
1513 case ir_unop_dFdy_coarse:
1514 case ir_unop_dFdy_fine:
1515 unreachable("derivatives not valid in vertex shader");
1516
1517 case ir_unop_bitfield_reverse:
1518 emit(BFREV(result_dst, op[0]));
1519 break;
1520 case ir_unop_bit_count:
1521 emit(CBIT(result_dst, op[0]));
1522 break;
1523 case ir_unop_find_msb: {
1524 src_reg temp = src_reg(this, glsl_type::uint_type);
1525
1526 inst = emit(FBH(dst_reg(temp), op[0]));
1527 inst->dst.writemask = WRITEMASK_XYZW;
1528
1529 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1530 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1531 * subtract the result from 31 to convert the MSB count into an LSB count.
1532 */
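      /* For example (assuming FBH reports the bit position counted from the
       * MSB side, i.e. the number of leading zeros): FBH(0x00000004) returns
       * 29, and 31 - 29 = 2, which is findMSB(4) counted from the LSB.
       */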
1533
1534 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1535 temp.swizzle = BRW_SWIZZLE_NOOP;
1536 emit(MOV(result_dst, temp));
1537
1538 src_reg src_tmp = src_reg(result_dst);
1539 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1540
1541 src_tmp.negate = true;
1542 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1543 inst->predicate = BRW_PREDICATE_NORMAL;
1544 break;
1545 }
1546 case ir_unop_find_lsb:
1547 emit(FBL(result_dst, op[0]));
1548 break;
1549 case ir_unop_saturate:
1550 inst = emit(MOV(result_dst, op[0]));
1551 inst->saturate = true;
1552 break;
1553
1554 case ir_unop_noise:
1555 unreachable("not reached: should be handled by lower_noise");
1556
1557 case ir_binop_add:
1558 emit(ADD(result_dst, op[0], op[1]));
1559 break;
1560 case ir_binop_sub:
1561 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1562
1563 case ir_binop_mul:
1564 if (devinfo->gen < 8 && ir->type->is_integer()) {
1565 /* For integer multiplication, the MUL uses the low 16 bits of one of
1566 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1567 * accumulates in the contribution of the upper 16 bits of that
1568 * operand. If we can determine that one of the args is in the low
1569 * 16 bits, though, we can just emit a single MUL.
1570 */
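         /* (E.g. a constant operand known to fit in 16 bits, such as 0xffff,
          * lets a single MUL produce the full 32-bit product, skipping the
          * MACH/accumulator sequence in the else branch below.)
          */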
1571 if (ir->operands[0]->is_uint16_constant()) {
1572 if (devinfo->gen < 7)
1573 emit(MUL(result_dst, op[0], op[1]));
1574 else
1575 emit(MUL(result_dst, op[1], op[0]));
1576 } else if (ir->operands[1]->is_uint16_constant()) {
1577 if (devinfo->gen < 7)
1578 emit(MUL(result_dst, op[1], op[0]));
1579 else
1580 emit(MUL(result_dst, op[0], op[1]));
1581 } else {
1582 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1583
1584 emit(MUL(acc, op[0], op[1]));
1585 emit(MACH(dst_null_d(), op[0], op[1]));
1586 emit(MOV(result_dst, src_reg(acc)));
1587 }
1588 } else {
1589 emit(MUL(result_dst, op[0], op[1]));
1590 }
1591 break;
1592 case ir_binop_imul_high: {
1593 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1594
1595 emit(MUL(acc, op[0], op[1]));
1596 emit(MACH(result_dst, op[0], op[1]));
1597 break;
1598 }
1599 case ir_binop_div:
1600 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1601 assert(ir->type->is_integer());
1602 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1603 break;
1604 case ir_binop_carry: {
1605 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1606
1607 emit(ADDC(dst_null_ud(), op[0], op[1]));
1608 emit(MOV(result_dst, src_reg(acc)));
1609 break;
1610 }
1611 case ir_binop_borrow: {
1612 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1613
1614 emit(SUBB(dst_null_ud(), op[0], op[1]));
1615 emit(MOV(result_dst, src_reg(acc)));
1616 break;
1617 }
1618 case ir_binop_mod:
1619 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1620 assert(ir->type->is_integer());
1621 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1622 break;
1623
1624 case ir_binop_less:
1625 case ir_binop_greater:
1626 case ir_binop_lequal:
1627 case ir_binop_gequal:
1628 case ir_binop_equal:
1629 case ir_binop_nequal: {
1630 if (devinfo->gen <= 5) {
1631 resolve_bool_comparison(ir->operands[0], &op[0]);
1632 resolve_bool_comparison(ir->operands[1], &op[1]);
1633 }
1634 emit(CMP(result_dst, op[0], op[1],
1635 brw_conditional_for_comparison(ir->operation)));
1636 break;
1637 }
1638
1639 case ir_binop_all_equal:
1640 if (devinfo->gen <= 5) {
1641 resolve_bool_comparison(ir->operands[0], &op[0]);
1642 resolve_bool_comparison(ir->operands[1], &op[1]);
1643 }
1644
1645 /* "==" operator producing a scalar boolean. */
1646 if (ir->operands[0]->type->is_vector() ||
1647 ir->operands[1]->type->is_vector()) {
1648 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1649 emit(MOV(result_dst, src_reg(0)));
1650 inst = emit(MOV(result_dst, src_reg(~0)));
1651 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1652 } else {
1653 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1654 }
1655 break;
1656 case ir_binop_any_nequal:
1657 if (devinfo->gen <= 5) {
1658 resolve_bool_comparison(ir->operands[0], &op[0]);
1659 resolve_bool_comparison(ir->operands[1], &op[1]);
1660 }
1661
1662 /* "!=" operator producing a scalar boolean. */
1663 if (ir->operands[0]->type->is_vector() ||
1664 ir->operands[1]->type->is_vector()) {
1665 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1666
1667 emit(MOV(result_dst, src_reg(0)));
1668 inst = emit(MOV(result_dst, src_reg(~0)));
1669 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1670 } else {
1671 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1672 }
1673 break;
1674
1675 case ir_unop_any:
1676 if (devinfo->gen <= 5) {
1677 resolve_bool_comparison(ir->operands[0], &op[0]);
1678 }
1679 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1680 emit(MOV(result_dst, src_reg(0)));
1681
1682 inst = emit(MOV(result_dst, src_reg(~0)));
1683 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1684 break;
1685
1686 case ir_binop_logic_xor:
1687 emit(XOR(result_dst, op[0], op[1]));
1688 break;
1689
1690 case ir_binop_logic_or:
1691 emit(OR(result_dst, op[0], op[1]));
1692 break;
1693
1694 case ir_binop_logic_and:
1695 emit(AND(result_dst, op[0], op[1]));
1696 break;
1697
1698 case ir_binop_dot:
1699 assert(ir->operands[0]->type->is_vector());
1700 assert(ir->operands[0]->type == ir->operands[1]->type);
1701 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1702 break;
1703
1704 case ir_unop_sqrt:
1705 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1706 break;
1707 case ir_unop_rsq:
1708 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1709 break;
1710
1711 case ir_unop_bitcast_i2f:
1712 case ir_unop_bitcast_u2f:
1713 this->result = op[0];
1714 this->result.type = BRW_REGISTER_TYPE_F;
1715 break;
1716
1717 case ir_unop_bitcast_f2i:
1718 this->result = op[0];
1719 this->result.type = BRW_REGISTER_TYPE_D;
1720 break;
1721
1722 case ir_unop_bitcast_f2u:
1723 this->result = op[0];
1724 this->result.type = BRW_REGISTER_TYPE_UD;
1725 break;
1726
1727 case ir_unop_i2f:
1728 case ir_unop_i2u:
1729 case ir_unop_u2i:
1730 case ir_unop_u2f:
1731 case ir_unop_f2i:
1732 case ir_unop_f2u:
1733 emit(MOV(result_dst, op[0]));
1734 break;
1735 case ir_unop_b2i:
1736 emit(AND(result_dst, op[0], src_reg(1)));
1737 break;
1738 case ir_unop_b2f:
1739 if (devinfo->gen <= 5) {
1740 resolve_bool_comparison(ir->operands[0], &op[0]);
1741 }
1742 op[0].type = BRW_REGISTER_TYPE_D;
1743 result_dst.type = BRW_REGISTER_TYPE_D;
1744 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1745 result_dst.type = BRW_REGISTER_TYPE_F;
1746 break;
1747 case ir_unop_f2b:
1748 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1749 break;
1750 case ir_unop_i2b:
1751 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1752 break;
1753
1754 case ir_unop_trunc:
1755 emit(RNDZ(result_dst, op[0]));
1756 break;
1757 case ir_unop_ceil: {
1758 src_reg tmp = src_reg(this, ir->type);
1759 op[0].negate = !op[0].negate;
1760 emit(RNDD(dst_reg(tmp), op[0]));
1761 tmp.negate = true;
1762 emit(MOV(result_dst, tmp));
1763 }
1764 break;
1765 case ir_unop_floor:
1766 inst = emit(RNDD(result_dst, op[0]));
1767 break;
1768 case ir_unop_fract:
1769 inst = emit(FRC(result_dst, op[0]));
1770 break;
1771 case ir_unop_round_even:
1772 emit(RNDE(result_dst, op[0]));
1773 break;
1774
1775 case ir_binop_min:
1776 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1777 break;
1778 case ir_binop_max:
1779 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1780 break;
1781
1782 case ir_binop_pow:
1783 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1784 break;
1785
1786 case ir_unop_bit_not:
1787 inst = emit(NOT(result_dst, op[0]));
1788 break;
1789 case ir_binop_bit_and:
1790 inst = emit(AND(result_dst, op[0], op[1]));
1791 break;
1792 case ir_binop_bit_xor:
1793 inst = emit(XOR(result_dst, op[0], op[1]));
1794 break;
1795 case ir_binop_bit_or:
1796 inst = emit(OR(result_dst, op[0], op[1]));
1797 break;
1798
1799 case ir_binop_lshift:
1800 inst = emit(SHL(result_dst, op[0], op[1]));
1801 break;
1802
1803 case ir_binop_rshift:
1804 if (ir->type->base_type == GLSL_TYPE_INT)
1805 inst = emit(ASR(result_dst, op[0], op[1]));
1806 else
1807 inst = emit(SHR(result_dst, op[0], op[1]));
1808 break;
1809
1810 case ir_binop_bfm:
1811 emit(BFI1(result_dst, op[0], op[1]));
1812 break;
1813
1814 case ir_binop_ubo_load: {
1815 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1816 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1817 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1818 src_reg offset;
1819
1820 /* Now, load the vector from that offset. */
1821 assert(ir->type->is_vector() || ir->type->is_scalar());
1822
1823 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1824 packed_consts.type = result.type;
1825 src_reg surf_index;
1826
1827 if (const_uniform_block) {
1828 /* The block index is a constant, so just emit the binding table entry
1829 * as an immediate.
1830 */
1831 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1832 const_uniform_block->value.u[0]);
1833 } else {
1834 /* The block index is not a constant. Evaluate the index expression
1835 * per-channel and add the base UBO index; we have to select a value
1836 * from any live channel.
1837 */
1838 surf_index = src_reg(this, glsl_type::uint_type);
1839 emit(ADD(dst_reg(surf_index), op[0],
1840 src_reg(prog_data->base.binding_table.ubo_start)));
1841 emit_uniformize(dst_reg(surf_index), surf_index);
1842
1843 /* Assume this may touch any UBO. It would be nice to provide
1844 * a tighter bound, but the array information is already lowered away.
1845 */
1846 brw_mark_surface_used(&prog_data->base,
1847 prog_data->base.binding_table.ubo_start +
1848 shader_prog->NumUniformBlocks - 1);
1849 }
1850
1851 if (const_offset_ir) {
1852 if (devinfo->gen >= 8) {
1853 /* Store the offset in a GRF so we can send-from-GRF. */
1854 offset = src_reg(this, glsl_type::int_type);
1855 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1856 } else {
1857 /* Immediates are fine on older generations since they'll be moved
1858 * to a (potentially fake) MRF at the generator level.
1859 */
1860 offset = src_reg(const_offset / 16);
1861 }
1862 } else {
1863 offset = src_reg(this, glsl_type::uint_type);
1864 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1865 }
1866
1867 emit_pull_constant_load_reg(dst_reg(packed_consts),
1868 surf_index,
1869 offset,
1870 NULL, NULL /* before_block/inst */);
1871
1872 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1873 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1874 const_offset % 16 / 4,
1875 const_offset % 16 / 4,
1876 const_offset % 16 / 4);
1877
1878 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1879 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1880 emit(CMP(result_dst, packed_consts, src_reg(0u),
1881 BRW_CONDITIONAL_NZ));
1882 } else {
1883 emit(MOV(result_dst, packed_consts));
1884 }
1885 break;
1886 }
1887
1888 case ir_binop_vector_extract:
1889 unreachable("should have been lowered by vec_index_to_cond_assign");
1890
1891 case ir_triop_fma:
1892 op[0] = fix_3src_operand(op[0]);
1893 op[1] = fix_3src_operand(op[1]);
1894 op[2] = fix_3src_operand(op[2]);
1895 /* Note that the instruction's argument order is reversed from GLSL
1896 * and the IR.
1897 */
1898 emit(MAD(result_dst, op[2], op[1], op[0]));
1899 break;
1900
1901 case ir_triop_lrp:
1902 emit_lrp(result_dst, op[0], op[1], op[2]);
1903 break;
1904
1905 case ir_triop_csel:
1906 unreachable("already handled above");
1907 break;
1908
1909 case ir_triop_bfi:
1910 op[0] = fix_3src_operand(op[0]);
1911 op[1] = fix_3src_operand(op[1]);
1912 op[2] = fix_3src_operand(op[2]);
1913 emit(BFI2(result_dst, op[0], op[1], op[2]));
1914 break;
1915
1916 case ir_triop_bitfield_extract:
1917 op[0] = fix_3src_operand(op[0]);
1918 op[1] = fix_3src_operand(op[1]);
1919 op[2] = fix_3src_operand(op[2]);
1920 /* Note that the instruction's argument order is reversed from GLSL
1921 * and the IR.
1922 */
1923 emit(BFE(result_dst, op[2], op[1], op[0]));
1924 break;
1925
1926 case ir_triop_vector_insert:
1927 unreachable("should have been lowered by lower_vector_insert");
1928
1929 case ir_quadop_bitfield_insert:
1930 unreachable("not reached: should be handled by "
1931 "bitfield_insert_to_bfm_bfi\n");
1932
1933 case ir_quadop_vector:
1934 unreachable("not reached: should be handled by lower_quadop_vector");
1935
1936 case ir_unop_pack_half_2x16:
1937 emit_pack_half_2x16(result_dst, op[0]);
1938 break;
1939 case ir_unop_unpack_half_2x16:
1940 emit_unpack_half_2x16(result_dst, op[0]);
1941 break;
1942 case ir_unop_unpack_unorm_4x8:
1943 emit_unpack_unorm_4x8(result_dst, op[0]);
1944 break;
1945 case ir_unop_unpack_snorm_4x8:
1946 emit_unpack_snorm_4x8(result_dst, op[0]);
1947 break;
1948 case ir_unop_pack_unorm_4x8:
1949 emit_pack_unorm_4x8(result_dst, op[0]);
1950 break;
1951 case ir_unop_pack_snorm_4x8:
1952 emit_pack_snorm_4x8(result_dst, op[0]);
1953 break;
1954 case ir_unop_pack_snorm_2x16:
1955 case ir_unop_pack_unorm_2x16:
1956 case ir_unop_unpack_snorm_2x16:
1957 case ir_unop_unpack_unorm_2x16:
1958 unreachable("not reached: should be handled by lower_packing_builtins");
1959 case ir_unop_unpack_half_2x16_split_x:
1960 case ir_unop_unpack_half_2x16_split_y:
1961 case ir_binop_pack_half_2x16_split:
1962 case ir_unop_interpolate_at_centroid:
1963 case ir_binop_interpolate_at_sample:
1964 case ir_binop_interpolate_at_offset:
1965 unreachable("not reached: should not occur in vertex shader");
1966 case ir_binop_ldexp:
1967 unreachable("not reached: should be handled by ldexp_to_arith()");
1968 case ir_unop_d2f:
1969 case ir_unop_f2d:
1970 case ir_unop_d2i:
1971 case ir_unop_i2d:
1972 case ir_unop_d2u:
1973 case ir_unop_u2d:
1974 case ir_unop_d2b:
1975 case ir_unop_pack_double_2x32:
1976 case ir_unop_unpack_double_2x32:
1977 case ir_unop_frexp_sig:
1978 case ir_unop_frexp_exp:
1979 unreachable("fp64 todo");
1980 }
1981 }
1982
1983
1984 void
1985 vec4_visitor::visit(ir_swizzle *ir)
1986 {
1987 /* Note that this is only swizzles in expressions, not those on the left
1988 * hand side of an assignment, which do write masking. See ir_assignment
1989 * for that.
1990 */
1991 const unsigned swz = brw_compose_swizzle(
1992 brw_swizzle_for_size(ir->type->vector_elements),
1993 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1994
1995 ir->val->accept(this);
1996 this->result = swizzle(this->result, swz);
1997 }
1998
1999 void
2000 vec4_visitor::visit(ir_dereference_variable *ir)
2001 {
2002 const struct glsl_type *type = ir->type;
2003 dst_reg *reg = variable_storage(ir->var);
2004
2005 if (!reg) {
2006 fail("Failed to find variable storage for %s\n", ir->var->name);
2007 this->result = src_reg(brw_null_reg());
2008 return;
2009 }
2010
2011 this->result = src_reg(*reg);
2012
2013 /* System values get their swizzle from the dst_reg writemask */
2014 if (ir->var->data.mode == ir_var_system_value)
2015 return;
2016
2017 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2018 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2019 }
2020
2021
2022 int
2023 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2024 {
2025 /* Under normal circumstances array elements are stored consecutively, so
2026 * the stride is equal to the size of the array element.
2027 */
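/* With this backend's vec4 packing, a scalar or vector element occupies one
 * register (stride 1), while a matrix element takes one register per column,
 * e.g. a stride of 4 for mat4.
 */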
2028 return type_size(ir->type);
2029 }
2030
2031
2032 void
2033 vec4_visitor::visit(ir_dereference_array *ir)
2034 {
2035 ir_constant *constant_index;
2036 src_reg src;
2037 int array_stride = compute_array_stride(ir);
2038
2039 constant_index = ir->array_index->constant_expression_value();
2040
2041 ir->array->accept(this);
2042 src = this->result;
2043
2044 if (constant_index) {
2045 src.reg_offset += constant_index->value.i[0] * array_stride;
2046 } else {
2047 /* Variable index array dereference. It eats the "vec4" of the
2048 * base of the array and an index that offsets the Mesa register
2049 * index.
2050 */
2051 ir->array_index->accept(this);
2052
2053 src_reg index_reg;
2054
2055 if (array_stride == 1) {
2056 index_reg = this->result;
2057 } else {
2058 index_reg = src_reg(this, glsl_type::int_type);
2059
2060 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2061 }
2062
2063 if (src.reladdr) {
2064 src_reg temp = src_reg(this, glsl_type::int_type);
2065
2066 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2067
2068 index_reg = temp;
2069 }
2070
2071 src.reladdr = ralloc(mem_ctx, src_reg);
2072 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2073 }
2074
2075 /* If the type is smaller than a vec4, replicate the last channel out. */
2076 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2077 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2078 else
2079 src.swizzle = BRW_SWIZZLE_NOOP;
2080 src.type = brw_type_for_base_type(ir->type);
2081
2082 this->result = src;
2083 }
2084
2085 void
2086 vec4_visitor::visit(ir_dereference_record *ir)
2087 {
2088 unsigned int i;
2089 const glsl_type *struct_type = ir->record->type;
2090 int offset = 0;
2091
2092 ir->record->accept(this);
2093
2094 for (i = 0; i < struct_type->length; i++) {
2095 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2096 break;
2097 offset += type_size(struct_type->fields.structure[i].type);
2098 }
2099
2100 /* If the type is smaller than a vec4, replicate the last channel out. */
2101 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2102 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2103 else
2104 this->result.swizzle = BRW_SWIZZLE_NOOP;
2105 this->result.type = brw_type_for_base_type(ir->type);
2106
2107 this->result.reg_offset += offset;
2108 }
2109
2110 /**
2111 * We want to be careful in assignment setup to hit the actual storage
2112 * instead of potentially using a temporary like we might with the
2113 * ir_dereference handler.
2114 */
2115 static dst_reg
2116 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2117 {
2118 /* The LHS must be a dereference. If the LHS is a variable indexed array
2119 * access of a vector, it must be separated into a series of conditional moves
2120 * before reaching this point (see ir_vec_index_to_cond_assign).
2121 */
2122 assert(ir->as_dereference());
2123 ir_dereference_array *deref_array = ir->as_dereference_array();
2124 if (deref_array) {
2125 assert(!deref_array->array->type->is_vector());
2126 }
2127
2128 /* Use the rvalue deref handler for the most part. We'll ignore
2129 * swizzles in it and write swizzles using writemask, though.
2130 */
2131 ir->accept(v);
2132 return dst_reg(v->result);
2133 }
2134
2135 void
2136 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2137 const struct glsl_type *type,
2138 enum brw_predicate predicate)
2139 {
2140 if (type->base_type == GLSL_TYPE_STRUCT) {
2141 for (unsigned int i = 0; i < type->length; i++) {
2142 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2143 }
2144 return;
2145 }
2146
2147 if (type->is_array()) {
2148 for (unsigned int i = 0; i < type->length; i++) {
2149 emit_block_move(dst, src, type->fields.array, predicate);
2150 }
2151 return;
2152 }
2153
2154 if (type->is_matrix()) {
2155 const struct glsl_type *vec_type;
2156
2157 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2158 type->vector_elements, 1);
2159
2160 for (int i = 0; i < type->matrix_columns; i++) {
2161 emit_block_move(dst, src, vec_type, predicate);
2162 }
2163 return;
2164 }
2165
2166 assert(type->is_scalar() || type->is_vector());
2167
2168 dst->type = brw_type_for_base_type(type);
2169 src->type = dst->type;
2170
2171 dst->writemask = (1 << type->vector_elements) - 1;
2172
2173 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2174
2175 vec4_instruction *inst = emit(MOV(*dst, *src));
2176 inst->predicate = predicate;
2177
2178 dst->reg_offset++;
2179 src->reg_offset++;
2180 }
2181
2182
2183 /* If the RHS processing resulted in an instruction generating a
2184 * temporary value, and it would be easy to rewrite the instruction to
2185 * generate its result right into the LHS instead, do so. This ends
2186 * up reliably removing instructions where it can be tricky to do so
2187 * later without real UD chain information.
2188 */
2189 bool
2190 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2191 dst_reg dst,
2192 src_reg src,
2193 vec4_instruction *pre_rhs_inst,
2194 vec4_instruction *last_rhs_inst)
2195 {
2196 /* This could be supported, but it would take more smarts. */
2197 if (ir->condition)
2198 return false;
2199
2200 if (pre_rhs_inst == last_rhs_inst)
2201 return false; /* No instructions generated to work with. */
2202
2203 /* Make sure the last instruction generated our source reg. */
2204 if (src.file != GRF ||
2205 src.file != last_rhs_inst->dst.file ||
2206 src.reg != last_rhs_inst->dst.reg ||
2207 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2208 src.reladdr ||
2209 src.abs ||
2210 src.negate ||
2211 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2212 return false;
2213
2214 /* Check that the last instruction fully initialized the channels
2215 * we want to use, in the order we want to use them. We could
2216 * potentially reswizzle the operands of many instructions so that
2217 * we could handle out of order channels, but don't yet.
2218 */
2219
2220 for (unsigned i = 0; i < 4; i++) {
2221 if (dst.writemask & (1 << i)) {
2222 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2223 return false;
2224
2225 if (BRW_GET_SWZ(src.swizzle, i) != i)
2226 return false;
2227 }
2228 }
2229
2230 /* Success! Rewrite the instruction. */
2231 last_rhs_inst->dst.file = dst.file;
2232 last_rhs_inst->dst.reg = dst.reg;
2233 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2234 last_rhs_inst->dst.reladdr = dst.reladdr;
2235 last_rhs_inst->dst.writemask &= dst.writemask;
2236
2237 return true;
2238 }
2239
2240 void
2241 vec4_visitor::visit(ir_assignment *ir)
2242 {
2243 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2244 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2245
2246 if (!ir->lhs->type->is_scalar() &&
2247 !ir->lhs->type->is_vector()) {
2248 ir->rhs->accept(this);
2249 src_reg src = this->result;
2250
2251 if (ir->condition) {
2252 emit_bool_to_cond_code(ir->condition, &predicate);
2253 }
2254
2255 /* emit_block_move doesn't account for swizzles in the source register.
2256 * This should be ok, since the source register is a structure or an
2257 * array, and those can't be swizzled. But double-check to be sure.
2258 */
2259 assert(src.swizzle ==
2260 (ir->rhs->type->is_matrix()
2261 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2262 : BRW_SWIZZLE_NOOP));
2263
2264 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2265 return;
2266 }
2267
2268 /* Now we're down to just a scalar/vector with writemasks. */
2269 int i;
2270
2271 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2272 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2273
2274 ir->rhs->accept(this);
2275
2276 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2277
2278 int swizzles[4];
2279 int src_chan = 0;
2280
2281 assert(ir->lhs->type->is_vector() ||
2282 ir->lhs->type->is_scalar());
2283 dst.writemask = ir->write_mask;
2284
2285 /* Swizzle a small RHS vector into the channels being written.
2286 *
2287 * glsl ir treats write_mask as dictating how many channels are
2288 * present on the RHS while in our instructions we need to make
2289 * those channels appear in the slots of the vec4 they're written to.
2290 */
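/* For example, a .yw write mask yields swizzles = { 0, 0, 0, 1 }, so RHS
 * channel 0 lands in .y of the destination and channel 1 lands in .w.
 */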
2291 for (int i = 0; i < 4; i++)
2292 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2293
2294 src_reg src = swizzle(this->result,
2295 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2296 swizzles[2], swizzles[3]));
2297
2298 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2299 return;
2300 }
2301
2302 if (ir->condition) {
2303 emit_bool_to_cond_code(ir->condition, &predicate);
2304 }
2305
2306 for (i = 0; i < type_size(ir->lhs->type); i++) {
2307 vec4_instruction *inst = emit(MOV(dst, src));
2308 inst->predicate = predicate;
2309
2310 dst.reg_offset++;
2311 src.reg_offset++;
2312 }
2313 }
2314
2315 void
2316 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2317 {
2318 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2319 foreach_in_list(ir_constant, field_value, &ir->components) {
2320 emit_constant_values(dst, field_value);
2321 }
2322 return;
2323 }
2324
2325 if (ir->type->is_array()) {
2326 for (unsigned int i = 0; i < ir->type->length; i++) {
2327 emit_constant_values(dst, ir->array_elements[i]);
2328 }
2329 return;
2330 }
2331
2332 if (ir->type->is_matrix()) {
2333 for (int i = 0; i < ir->type->matrix_columns; i++) {
2334 float *vec = &ir->value.f[i * ir->type->vector_elements];
2335
2336 for (int j = 0; j < ir->type->vector_elements; j++) {
2337 dst->writemask = 1 << j;
2338 dst->type = BRW_REGISTER_TYPE_F;
2339
2340 emit(MOV(*dst, src_reg(vec[j])));
2341 }
2342 dst->reg_offset++;
2343 }
2344 return;
2345 }
2346
2347 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2348
2349 for (int i = 0; i < ir->type->vector_elements; i++) {
2350 if (!(remaining_writemask & (1 << i)))
2351 continue;
2352
2353 dst->writemask = 1 << i;
2354 dst->type = brw_type_for_base_type(ir->type);
2355
2356 /* Find other components that match the one we're about to
2357 * write. Emits fewer instructions for things like vec4(0.5,
2358 * 1.5, 1.5, 1.5).
2359 */
2360 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2361 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2362 if (ir->value.b[i] == ir->value.b[j])
2363 dst->writemask |= (1 << j);
2364 } else {
2365 /* u, i, and f storage all line up, so no need for a
2366 * switch case for comparing each type.
2367 */
2368 if (ir->value.u[i] == ir->value.u[j])
2369 dst->writemask |= (1 << j);
2370 }
2371 }
2372
2373 switch (ir->type->base_type) {
2374 case GLSL_TYPE_FLOAT:
2375 emit(MOV(*dst, src_reg(ir->value.f[i])));
2376 break;
2377 case GLSL_TYPE_INT:
2378 emit(MOV(*dst, src_reg(ir->value.i[i])));
2379 break;
2380 case GLSL_TYPE_UINT:
2381 emit(MOV(*dst, src_reg(ir->value.u[i])));
2382 break;
2383 case GLSL_TYPE_BOOL:
2384 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2385 break;
2386 default:
2387 unreachable("Non-float/uint/int/bool constant");
2388 }
2389
2390 remaining_writemask &= ~dst->writemask;
2391 }
2392 dst->reg_offset++;
2393 }
2394
2395 void
2396 vec4_visitor::visit(ir_constant *ir)
2397 {
2398 dst_reg dst = dst_reg(this, ir->type);
2399 this->result = src_reg(dst);
2400
2401 emit_constant_values(&dst, ir);
2402 }
2403
2404 void
2405 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2406 {
2407 ir_dereference *deref = static_cast<ir_dereference *>(
2408 ir->actual_parameters.get_head());
2409 ir_variable *location = deref->variable_referenced();
2410 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2411 location->data.binding);
2412
2413 /* Calculate the surface offset */
2414 src_reg offset(this, glsl_type::uint_type);
2415 ir_dereference_array *deref_array = deref->as_dereference_array();
2416 if (deref_array) {
2417 deref_array->array_index->accept(this);
2418
2419 src_reg tmp(this, glsl_type::uint_type);
2420 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2421 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2422 } else {
2423 offset = location->data.atomic.offset;
2424 }
2425
2426 /* Emit the appropriate machine instruction */
2427 const char *callee = ir->callee->function_name();
2428 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2429
2430 if (!strcmp("__intrinsic_atomic_read", callee)) {
2431 emit_untyped_surface_read(surf_index, dst, offset);
2432
2433 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2434 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2435 src_reg(), src_reg());
2436
2437 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2438 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2439 src_reg(), src_reg());
2440 }
2441 }
2442
2443 void
2444 vec4_visitor::visit(ir_call *ir)
2445 {
2446 const char *callee = ir->callee->function_name();
2447
2448 if (!strcmp("__intrinsic_atomic_read", callee) ||
2449 !strcmp("__intrinsic_atomic_increment", callee) ||
2450 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2451 visit_atomic_counter_intrinsic(ir);
2452 } else {
2453 unreachable("Unsupported intrinsic.");
2454 }
2455 }
2456
2457 src_reg
2458 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2459 {
2460 vec4_instruction *inst =
2461 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2462 dst_reg(this, glsl_type::uvec4_type));
2463 inst->base_mrf = 2;
2464 inst->src[1] = sampler;
2465
2466 int param_base;
2467
2468 if (devinfo->gen >= 9) {
2469 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2470 vec4_instruction *header_inst = new(mem_ctx)
2471 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2472 dst_reg(MRF, inst->base_mrf));
2473
2474 emit(header_inst);
2475
2476 inst->mlen = 2;
2477 inst->header_size = 1;
2478 param_base = inst->base_mrf + 1;
2479 } else {
2480 inst->mlen = 1;
2481 param_base = inst->base_mrf;
2482 }
2483
2484 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
2485 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2486 int zero_mask = 0xf & ~coord_mask;
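/* e.g. a vec2 coordinate gives coord_mask == 0x3 and zero_mask == 0xc, so
 * .zw of the payload register are written with zero below.
 */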
2487
2488 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2489 coordinate));
2490
2491 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2492 src_reg(0)));
2493
2494 emit(inst);
2495 return src_reg(inst->dst);
2496 }
2497
2498 static bool
2499 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2500 {
2501 if (devinfo->gen < 8 && !devinfo->is_haswell)
2502 return false;
2503
2504 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2505 }
2506
2507 void
2508 vec4_visitor::visit(ir_texture *ir)
2509 {
2510 uint32_t sampler =
2511 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2512
2513 ir_rvalue *nonconst_sampler_index =
2514 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2515
2516 /* Handle non-constant sampler array indexing */
2517 src_reg sampler_reg;
2518 if (nonconst_sampler_index) {
2519 /* The highest sampler which may be used by this operation is
2520 * the last element of the array. Mark it here, because the generator
2521 * doesn't have enough information to determine the bound.
2522 */
2523 uint32_t array_size = ir->sampler->as_dereference_array()
2524 ->array->type->array_size();
2525
2526 uint32_t max_used = sampler + array_size - 1;
2527 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2528 max_used += prog_data->base.binding_table.gather_texture_start;
2529 } else {
2530 max_used += prog_data->base.binding_table.texture_start;
2531 }
2532
2533 brw_mark_surface_used(&prog_data->base, max_used);
2534
2535 /* Emit code to evaluate the actual indexing expression */
2536 nonconst_sampler_index->accept(this);
2537 dst_reg temp(this, glsl_type::uint_type);
2538 emit(ADD(temp, this->result, src_reg(sampler)));
2539 emit_uniformize(temp, src_reg(temp));
2540
2541 sampler_reg = src_reg(temp);
2542 } else {
2543 /* Single sampler, or constant array index; the indexing expression
2544 * is just an immediate.
2545 */
2546 sampler_reg = src_reg(sampler);
2547 }
2548
2549 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2550 * emitting anything other than setting up the constant result.
2551 */
2552 if (ir->op == ir_tg4) {
2553 ir_constant *chan = ir->lod_info.component->as_constant();
2554 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2555 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2556 dst_reg result(this, ir->type);
2557 this->result = src_reg(result);
2558 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2559 return;
2560 }
2561 }
2562
2563 /* Should be lowered by do_lower_texture_projection */
2564 assert(!ir->projector);
2565
2566 /* Should be lowered */
2567 assert(!ir->offset || !ir->offset->type->is_array());
2568
2569 /* Generate code to compute all the subexpression trees. This has to be
2570 * done before loading any values into MRFs for the sampler message since
2571 * generating these values may involve SEND messages that need the MRFs.
2572 */
2573 src_reg coordinate;
2574 if (ir->coordinate) {
2575 ir->coordinate->accept(this);
2576 coordinate = this->result;
2577 }
2578
2579 src_reg shadow_comparitor;
2580 if (ir->shadow_comparitor) {
2581 ir->shadow_comparitor->accept(this);
2582 shadow_comparitor = this->result;
2583 }
2584
2585 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2586 src_reg offset_value;
2587 if (has_nonconstant_offset) {
2588 ir->offset->accept(this);
2589 offset_value = src_reg(this->result);
2590 }
2591
2592 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2593 src_reg lod, dPdx, dPdy, sample_index, mcs;
2594 switch (ir->op) {
2595 case ir_tex:
2596 lod = src_reg(0.0f);
2597 lod_type = glsl_type::float_type;
2598 break;
2599 case ir_txf:
2600 case ir_txl:
2601 case ir_txs:
2602 ir->lod_info.lod->accept(this);
2603 lod = this->result;
2604 lod_type = ir->lod_info.lod->type;
2605 break;
2606 case ir_query_levels:
2607 lod = src_reg(0);
2608 lod_type = glsl_type::int_type;
2609 break;
2610 case ir_txf_ms:
2611 ir->lod_info.sample_index->accept(this);
2612 sample_index = this->result;
2613 sample_index_type = ir->lod_info.sample_index->type;
2614
2615 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2616 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2617 else
2618 mcs = src_reg(0u);
2619 break;
2620 case ir_txd:
2621 ir->lod_info.grad.dPdx->accept(this);
2622 dPdx = this->result;
2623
2624 ir->lod_info.grad.dPdy->accept(this);
2625 dPdy = this->result;
2626
2627 lod_type = ir->lod_info.grad.dPdx->type;
2628 break;
2629 case ir_txb:
2630 case ir_lod:
2631 case ir_tg4:
2632 break;
2633 }
2634
2635 enum opcode opcode;
2636 switch (ir->op) {
2637 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2638 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2639 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2640 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2641 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2642 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2643 case ir_tg4: opcode = has_nonconstant_offset
2644 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2645 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2646 case ir_txb:
2647 unreachable("TXB is not valid for vertex shaders.");
2648 case ir_lod:
2649 unreachable("LOD is not valid for vertex shaders.");
2650 default:
2651 unreachable("Unrecognized tex op");
2652 }
2653
2654 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2655 opcode, dst_reg(this, ir->type));
2656
2657 if (ir->offset != NULL && !has_nonconstant_offset) {
2658 inst->offset =
2659 brw_texture_offset(ir->offset->as_constant()->value.i,
2660 ir->offset->type->vector_elements);
2661 }
2662
2663 /* Stuff the channel select bits in the top of the texture offset */
2664 if (ir->op == ir_tg4)
2665 inst->offset |= gather_channel(ir, sampler) << 16;
2666
2667 /* The message header is necessary for:
2668 * - Gen4 (always)
2669 * - Gen9+ for selecting SIMD4x2
2670 * - Texel offsets
2671 * - Gather channel selection
2672 * - Sampler indices too large to fit in a 4-bit value.
2673 */
2674 inst->header_size =
2675 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2676 inst->offset != 0 || ir->op == ir_tg4 ||
2677 is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2678 inst->base_mrf = 2;
2679 inst->mlen = inst->header_size + 1; /* always at least one */
2680 inst->dst.writemask = WRITEMASK_XYZW;
2681 inst->shadow_compare = ir->shadow_comparitor != NULL;
2682
2683 inst->src[1] = sampler_reg;
2684
2685 /* MRF for the first parameter */
2686 int param_base = inst->base_mrf + inst->header_size;
2687
2688 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2689 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2690 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2691 } else {
2692 /* Load the coordinate */
2693 /* FINISHME: gl_clamp_mask and saturate */
2694 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2695 int zero_mask = 0xf & ~coord_mask;
2696
2697 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2698 coordinate));
2699
2700 if (zero_mask != 0) {
2701 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2702 src_reg(0)));
2703 }
2704 /* Load the shadow comparitor */
2705 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2706 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2707 WRITEMASK_X),
2708 shadow_comparitor));
2709 inst->mlen++;
2710 }
2711
2712 /* Load the LOD info */
2713 if (ir->op == ir_tex || ir->op == ir_txl) {
2714 int mrf, writemask;
2715 if (devinfo->gen >= 5) {
2716 mrf = param_base + 1;
2717 if (ir->shadow_comparitor) {
2718 writemask = WRITEMASK_Y;
2719 /* mlen already incremented */
2720 } else {
2721 writemask = WRITEMASK_X;
2722 inst->mlen++;
2723 }
2724 } else /* devinfo->gen == 4 */ {
2725 mrf = param_base;
2726 writemask = WRITEMASK_W;
2727 }
2728 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2729 } else if (ir->op == ir_txf) {
2730 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2731 } else if (ir->op == ir_txf_ms) {
2732 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2733 sample_index));
2734 if (devinfo->gen >= 7) {
2735 /* MCS data is in the first channel of `mcs`, but we need to get it into
2736 * the .y channel of the second vec4 of params, so replicate .x across
2737 * the whole vec4 and then mask off everything except .y
2738 */
2739 mcs.swizzle = BRW_SWIZZLE_XXXX;
2740 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2741 mcs));
2742 }
2743 inst->mlen++;
2744 } else if (ir->op == ir_txd) {
2745 const glsl_type *type = lod_type;
2746
2747 if (devinfo->gen >= 5) {
2748 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2749 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2750 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2751 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
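/* The XXYY swizzles interleave the gradients so that param_base + 1 holds
 * (dPdx.x, dPdy.x, dPdx.y, dPdy.y), which appears to be the layout the
 * Gen5+ sample_d message expects in SIMD4x2 mode.
 */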
2752 inst->mlen++;
2753
2754 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2755 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2756 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2757 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2758 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2759 inst->mlen++;
2760
2761 if (ir->shadow_comparitor) {
2762 emit(MOV(dst_reg(MRF, param_base + 2,
2763 ir->shadow_comparitor->type, WRITEMASK_Z),
2764 shadow_comparitor));
2765 }
2766 }
2767 } else /* devinfo->gen == 4 */ {
2768 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2769 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2770 inst->mlen += 2;
2771 }
2772 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2773 if (ir->shadow_comparitor) {
2774 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2775 shadow_comparitor));
2776 }
2777
2778 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2779 offset_value));
2780 inst->mlen++;
2781 }
2782 }
2783
2784 emit(inst);
2785
2786 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2787 * spec requires layers.
2788 */
2789 if (ir->op == ir_txs) {
2790 glsl_type const *type = ir->sampler->type;
2791 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2792 type->sampler_array) {
2793 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2794 writemask(inst->dst, WRITEMASK_Z),
2795 src_reg(inst->dst), src_reg(6));
2796 }
2797 }
2798
2799 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2800 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2801 }
2802
2803 swizzle_result(ir, src_reg(inst->dst), sampler);
2804 }
2805
2806 /**
2807 * Apply workarounds for Gen6 gather with UINT/SINT
2808 */
2809 void
2810 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2811 {
2812 if (!wa)
2813 return;
2814
2815 int width = (wa & WA_8BIT) ? 8 : 16;
2816 dst_reg dst_f = dst;
2817 dst_f.type = BRW_REGISTER_TYPE_F;
2818
2819 /* Convert from UNORM to UINT */
2820 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2821 emit(MOV(dst, src_reg(dst_f)));
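/* e.g. for an 8-bit surface this scales the UNORM result by 255 (2^8 - 1)
 * to recover the original integer value before moving it back to an
 * integer type.
 */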
2822
2823 if (wa & WA_SIGN) {
2824 /* Reinterpret the UINT value as a signed INT value by
2825 * shifting the sign bit into place, then shifting back
2826 * preserving sign.
2827 */
2828 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2829 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2830 }
2831 }
2832
2833 /**
2834 * Set up the gather channel based on the swizzle, for gather4.
2835 */
2836 uint32_t
2837 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2838 {
2839 ir_constant *chan = ir->lod_info.component->as_constant();
2840 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2841 switch (swiz) {
2842 case SWIZZLE_X: return 0;
2843 case SWIZZLE_Y:
2844 /* gather4 sampler is broken for green channel on RG32F --
2845 * we must ask for blue instead.
2846 */
2847 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2848 return 2;
2849 return 1;
2850 case SWIZZLE_Z: return 2;
2851 case SWIZZLE_W: return 3;
2852 default:
2853 unreachable("Not reached"); /* zero, one swizzles handled already */
2854 }
2855 }
2856
2857 void
2858 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2859 {
2860 int s = key->tex.swizzles[sampler];
2861
2862 this->result = src_reg(this, ir->type);
2863 dst_reg swizzled_result(this->result);
2864
2865 if (ir->op == ir_query_levels) {
2866 /* # levels is in .w */
2867 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2868 emit(MOV(swizzled_result, orig_val));
2869 return;
2870 }
2871
2872 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2873 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2874 emit(MOV(swizzled_result, orig_val));
2875 return;
2876 }
2877
2878
2879 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2880 int swizzle[4] = {0};
2881
2882 for (int i = 0; i < 4; i++) {
2883 switch (GET_SWZ(s, i)) {
2884 case SWIZZLE_ZERO:
2885 zero_mask |= (1 << i);
2886 break;
2887 case SWIZZLE_ONE:
2888 one_mask |= (1 << i);
2889 break;
2890 default:
2891 copy_mask |= (1 << i);
2892 swizzle[i] = GET_SWZ(s, i);
2893 break;
2894 }
2895 }
2896
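/* For example, a GL swizzle of (R, G, ZERO, ONE) produces copy_mask == 0x3,
 * zero_mask == 0x4 and one_mask == 0x8, so only .xy are sourced from the
 * actual texture result.
 */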
2897 if (copy_mask) {
2898 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2899 swizzled_result.writemask = copy_mask;
2900 emit(MOV(swizzled_result, orig_val));
2901 }
2902
2903 if (zero_mask) {
2904 swizzled_result.writemask = zero_mask;
2905 emit(MOV(swizzled_result, src_reg(0.0f)));
2906 }
2907
2908 if (one_mask) {
2909 swizzled_result.writemask = one_mask;
2910 emit(MOV(swizzled_result, src_reg(1.0f)));
2911 }
2912 }
2913
2914 void
2915 vec4_visitor::visit(ir_return *)
2916 {
2917 unreachable("not reached");
2918 }
2919
2920 void
2921 vec4_visitor::visit(ir_discard *)
2922 {
2923 unreachable("not reached");
2924 }
2925
2926 void
2927 vec4_visitor::visit(ir_if *ir)
2928 {
2929 /* Don't point the annotation at the if statement, because then it plus
2930 * the then and else blocks get printed.
2931 */
2932 this->base_ir = ir->condition;
2933
2934 if (devinfo->gen == 6) {
2935 emit_if_gen6(ir);
2936 } else {
2937 enum brw_predicate predicate;
2938 emit_bool_to_cond_code(ir->condition, &predicate);
2939 emit(IF(predicate));
2940 }
2941
2942 visit_instructions(&ir->then_instructions);
2943
2944 if (!ir->else_instructions.is_empty()) {
2945 this->base_ir = ir->condition;
2946 emit(BRW_OPCODE_ELSE);
2947
2948 visit_instructions(&ir->else_instructions);
2949 }
2950
2951 this->base_ir = ir->condition;
2952 emit(BRW_OPCODE_ENDIF);
2953 }
2954
2955 void
2956 vec4_visitor::visit(ir_emit_vertex *)
2957 {
2958 unreachable("not reached");
2959 }
2960
2961 void
2962 vec4_visitor::visit(ir_end_primitive *)
2963 {
2964 unreachable("not reached");
2965 }
2966
2967 void
2968 vec4_visitor::visit(ir_barrier *)
2969 {
2970 unreachable("not reached");
2971 }
2972
2973 void
2974 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2975 dst_reg dst, src_reg offset,
2976 src_reg src0, src_reg src1)
2977 {
2978 unsigned mlen = 0;
2979
2980 /* Set the atomic operation offset. */
2981 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2982 mlen++;
2983
2984 /* Set the atomic operation arguments. */
2985 if (src0.file != BAD_FILE) {
2986 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2987 mlen++;
2988 }
2989
2990 if (src1.file != BAD_FILE) {
2991 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2992 mlen++;
2993 }
2994
2995 /* Emit the instruction. Note that this maps to the normal SIMD8
2996 * untyped atomic message on Ivy Bridge, but that's OK because
2997 * unused channels will be masked out.
2998 */
2999 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3000 brw_message_reg(0),
3001 src_reg(surf_index), src_reg(atomic_op));
3002 inst->mlen = mlen;
3003 }
3004
3005 void
3006 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3007 src_reg offset)
3008 {
3009 /* Set the surface read offset. */
3010 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3011
3012 /* Emit the instruction. Note that this maps to the normal SIMD8
3013 * untyped surface read message, but that's OK because unused
3014 * channels will be masked out.
3015 */
3016 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3017 brw_message_reg(0),
3018 src_reg(surf_index), src_reg(1));
3019 inst->mlen = 1;
3020 }
3021
3022 void
3023 vec4_visitor::emit_ndc_computation()
3024 {
3025 /* Get the position */
3026 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3027
3028 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3029 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3030 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3031
3032 current_annotation = "NDC";
3033 dst_reg ndc_w = ndc;
3034 ndc_w.writemask = WRITEMASK_W;
3035 src_reg pos_w = pos;
3036 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3037 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3038
3039 dst_reg ndc_xyz = ndc;
3040 ndc_xyz.writemask = WRITEMASK_XYZ;
3041
3042 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3043 }
3044
3045 void
3046 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3047 {
3048 if (devinfo->gen < 6 &&
3049 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3050 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3051 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3052 dst_reg header1_w = header1;
3053 header1_w.writemask = WRITEMASK_W;
3054
3055 emit(MOV(header1, 0u));
3056
3057 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3058 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3059
3060 current_annotation = "Point size";
3061 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3062 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
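/* The scale by 1 << 11 (8 * 256) and the 0x7ff << 8 mask place the point
 * width at bits 8..18 of the header, in what appears to be a fixed-point
 * format with three fractional bits.
 */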
3063 }
3064
3065 if (key->userclip_active) {
3066 current_annotation = "Clipping flags";
3067 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3068 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3069
3070 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3071 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3072 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3073
3074 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3075 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3076 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3077 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3078 }
3079
3080 /* i965 clipping workaround:
3081 * 1) Test for -ve rhw
3082 * 2) If set,
3083 * set ndc = (0,0,0,0)
3084 * set ucp[6] = 1
3085 *
3086 * Later, clipping will detect ucp[6] and ensure the primitive is
3087 * clipped against all fixed planes.
3088 */
3089 if (devinfo->has_negative_rhw_bug) {
3090 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3091 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3092 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3093 vec4_instruction *inst;
3094 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3095 inst->predicate = BRW_PREDICATE_NORMAL;
3096 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3097 inst->predicate = BRW_PREDICATE_NORMAL;
3098 }
3099
3100 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3101 } else if (devinfo->gen < 6) {
3102 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3103 } else {
3104 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3105 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3106 dst_reg reg_w = reg;
3107 reg_w.writemask = WRITEMASK_W;
3108 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3109 }
3110 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3111 dst_reg reg_y = reg;
3112 reg_y.writemask = WRITEMASK_Y;
3113 reg_y.type = BRW_REGISTER_TYPE_D;
3114 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3115 }
3116 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3117 dst_reg reg_z = reg;
3118 reg_z.writemask = WRITEMASK_Z;
3119 reg_z.type = BRW_REGISTER_TYPE_D;
3120 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3121 }
3122 }
3123 }
3124
3125 void
3126 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3127 {
3128 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3129 *
3130 * "If a linked set of shaders forming the vertex stage contains no
3131 * static write to gl_ClipVertex or gl_ClipDistance, but the
3132 * application has requested clipping against user clip planes through
3133 * the API, then the coordinate written to gl_Position is used for
3134 * comparison against the user clip planes."
3135 *
3136 * This function is only called if the shader didn't write to
3137 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3138 * if the user wrote to it; otherwise we use gl_Position.
3139 */
3140 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3141 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3142 clip_vertex = VARYING_SLOT_POS;
3143 }
3144
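/* Each call covers at most four planes: the caller passes offset 0 to fill
 * CLIP_DIST0 (planes 0-3) and offset 4 to fill CLIP_DIST1 (planes 4-7).
 */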
3145 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3146 ++i) {
3147 reg.writemask = 1 << i;
3148 emit(DP4(reg,
3149 src_reg(output_reg[clip_vertex]),
3150 src_reg(this->userplane[i + offset])));
3151 }
3152 }
3153
3154 vec4_instruction *
3155 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3156 {
3157 assert(varying < VARYING_SLOT_MAX);
3158 reg.type = output_reg[varying].type;
3159 current_annotation = output_reg_annotation[varying];
3160 /* Copy the register, saturating if necessary */
3161 return emit(MOV(reg, src_reg(output_reg[varying])));
3162 }
3163
3164 void
3165 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3166 {
3167 reg.type = BRW_REGISTER_TYPE_F;
3168
3169 switch (varying) {
3170 case VARYING_SLOT_PSIZ:
3171 {
3172 /* PSIZ is always in slot 0, and is coupled with other flags. */
3173 current_annotation = "indices, point width, clip flags";
3174 emit_psiz_and_flags(reg);
3175 break;
3176 }
3177 case BRW_VARYING_SLOT_NDC:
3178 current_annotation = "NDC";
3179 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3180 break;
3181 case VARYING_SLOT_POS:
3182 current_annotation = "gl_Position";
3183 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3184 break;
3185 case VARYING_SLOT_EDGE:
3186 /* This is present when doing unfilled polygons. We're supposed to copy
3187 * the edge flag from the user-provided vertex array
3188 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3189 * of that attribute (starts as 1.0f). This is then used in clipping to
3190 * determine which edges should be drawn as wireframe.
3191 */
3192 current_annotation = "edge flag";
3193 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3194 glsl_type::float_type, WRITEMASK_XYZW))));
3195 break;
3196 case BRW_VARYING_SLOT_PAD:
3197 /* No need to write to this slot */
3198 break;
3199 case VARYING_SLOT_COL0:
3200 case VARYING_SLOT_COL1:
3201 case VARYING_SLOT_BFC0:
3202 case VARYING_SLOT_BFC1: {
3203 /* These built-in varyings are only supported in compatibility mode,
3204 * and we only support GS in core profile. So, this must be a vertex
3205 * shader.
3206 */
3207 assert(stage == MESA_SHADER_VERTEX);
3208 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3209 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3210 inst->saturate = true;
3211 break;
3212 }
3213
3214 default:
3215 emit_generic_urb_slot(reg, varying);
3216 break;
3217 }
3218 }
3219
3220 static int
3221 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3222 {
3223 if (devinfo->gen >= 6) {
3224 /* URB data written (does not include the message header reg) must
3225 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3226 * section 5.4.3.2.2: URB_INTERLEAVED.
3227 *
3228 * URB entries are allocated on a multiple of 1024 bits, so an
3229 * extra 128 bits written here to make the end align to 256 is
3230 * no problem.
3231 */
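/* mlen includes the message header, so an even mlen means an odd amount of
 * URB data; e.g. mlen == 4 (header plus three data registers) is bumped to
 * 5 so that the data length is a multiple of two registers.
 */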
3232 if ((mlen % 2) != 1)
3233 mlen++;
3234 }
3235
3236 return mlen;
3237 }
3238
3239
3240 /**
3241 * Generates the VUE payload plus the necessary URB write instructions to
3242 * output it.
3243 *
3244 * The VUE layout is documented in Volume 2a.
3245 */
3246 void
3247 vec4_visitor::emit_vertex()
3248 {
3249 /* MRF 0 is reserved for the debugger, so start with message header
3250 * in MRF 1.
3251 */
3252 int base_mrf = 1;
3253 int mrf = base_mrf;
3254 /* In the process of generating our URB write message contents, we
3255 * may need to unspill a register or load from an array. Those
3256 * reads would use MRFs 14-15.
3257 */
3258 int max_usable_mrf = 13;
3259
3260 /* The following assertion verifies that max_usable_mrf causes an
3261 * even-numbered amount of URB write data, which will meet gen6's
3262 * requirements for length alignment.
3263 */
3264 assert((max_usable_mrf - base_mrf) % 2 == 0);
3265
3266 /* First mrf is the g0-based message header containing URB handles and
3267 * such.
3268 */
3269 emit_urb_write_header(mrf++);
3270
3271 if (devinfo->gen < 6) {
3272 emit_ndc_computation();
3273 }
3274
3275 /* Lower legacy ff and ClipVertex clipping to clip distances */
3276 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3277 current_annotation = "user clip distances";
3278
3279 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3280 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3281
3282 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3283 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3284 }
3285
3286 /* We may need to split this up into several URB writes, so do them in a
3287 * loop.
3288 */
3289 int slot = 0;
3290 bool complete = false;
3291 do {
3292 /* URB offset is in URB row increments, and each of our MRFs is half of
3293 * one of those, since we're doing interleaved writes.
3294 */
3295 int offset = slot / 2;
3296
3297 mrf = base_mrf + 1;
3298 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3299 emit_urb_slot(dst_reg(MRF, mrf++),
3300 prog_data->vue_map.slot_to_varying[slot]);
3301
3302 /* If this was max_usable_mrf, we can't fit anything more into this
3303 * URB WRITE.
3304 */
3305 if (mrf > max_usable_mrf) {
3306 slot++;
3307 break;
3308 }
3309 }
3310
3311 complete = slot >= prog_data->vue_map.num_slots;
3312 current_annotation = "URB write";
3313 vec4_instruction *inst = emit_urb_write_opcode(complete);
3314 inst->base_mrf = base_mrf;
3315 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3316 inst->offset += offset;
3317 } while (!complete);
3318 }
3319
3320
3321 src_reg
3322 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3323 src_reg *reladdr, int reg_offset)
3324 {
3325 /* Because we store the values to scratch interleaved like our
3326 * vertex data, we need to scale the vec4 index by 2.
3327 */
3328 int message_header_scale = 2;
3329
3330 /* Pre-gen6, the message header uses byte offsets instead of vec4
3331 * (16-byte) offset units.
3332 */
3333 if (devinfo->gen < 6)
3334 message_header_scale *= 16;
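/* e.g. a reg_offset of 3 yields 3 * 2 == 6 on Gen6+, or 3 * 2 * 16 == 96
 * bytes on Gen4-5, where the header uses byte units.
 */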
3335
3336 if (reladdr) {
3337 src_reg index = src_reg(this, glsl_type::int_type);
3338
3339 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3340 src_reg(reg_offset)));
3341 emit_before(block, inst, MUL(dst_reg(index), index,
3342 src_reg(message_header_scale)));
3343
3344 return index;
3345 } else {
3346 return src_reg(reg_offset * message_header_scale);
3347 }
3348 }
3349
3350 src_reg
3351 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3352 src_reg *reladdr, int reg_offset)
3353 {
3354 if (reladdr) {
3355 src_reg index = src_reg(this, glsl_type::int_type);
3356
3357 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3358 src_reg(reg_offset)));
3359
3360 /* Pre-gen6, the message header uses byte offsets instead of vec4
3361 * (16-byte) offset units.
3362 */
3363 if (devinfo->gen < 6) {
3364 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3365 }
3366
3367 return index;
3368 } else if (devinfo->gen >= 8) {
3369 /* Store the offset in a GRF so we can send-from-GRF. */
3370 src_reg offset = src_reg(this, glsl_type::int_type);
3371 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3372 return offset;
3373 } else {
3374 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3375 return src_reg(reg_offset * message_header_scale);
3376 }
3377 }
3378
3379 /**
3380 * Emits an instruction before @inst to load the value named by @orig_src
3381 * from scratch space at @base_offset to @temp.
3382 *
3383 * @base_offset is measured in 32-byte units (the size of a register).
3384 */
3385 void
3386 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3387 dst_reg temp, src_reg orig_src,
3388 int base_offset)
3389 {
3390 int reg_offset = base_offset + orig_src.reg_offset;
3391 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3392 reg_offset);
3393
3394 emit_before(block, inst, SCRATCH_READ(temp, index));
3395 }
3396
3397 /**
3398 * Emits an instruction after @inst to store the value to be written
3399 * to @orig_dst to scratch space at @base_offset, from @temp.
3400 *
3401 * @base_offset is measured in 32-byte units (the size of a register).
3402 */
3403 void
3404 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3405 int base_offset)
3406 {
3407 int reg_offset = base_offset + inst->dst.reg_offset;
3408 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3409 reg_offset);
3410
3411 /* Create a temporary register to store *inst's result in.
3412 *
3413 * We have to be careful in MOVing from our temporary result register in
3414 * the scratch write. If we swizzle from channels of the temporary that
3415 * weren't initialized, it will confuse live interval analysis, which will
3416 * make spilling fail to make progress.
3417 */
3418 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3419 inst->dst.type),
3420 brw_swizzle_for_mask(inst->dst.writemask));
3421 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3422 inst->dst.writemask));
3423 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3424 write->predicate = inst->predicate;
3425 write->ir = inst->ir;
3426 write->annotation = inst->annotation;
3427 inst->insert_after(block, write);
3428
3429 inst->dst.file = temp.file;
3430 inst->dst.reg = temp.reg;
3431 inst->dst.reg_offset = temp.reg_offset;
3432 inst->dst.reladdr = NULL;
3433 }
3434
3435 /**
3436 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3437 * adds the scratch read(s) before \p inst. The function also checks for
3438 * recursive reladdr scratch accesses, issuing the corresponding scratch
3439 * loads and rewriting reladdr references accordingly.
3440 *
3441 * \return \p src if it did not require a scratch load, otherwise, the
3442 * register holding the result of the scratch load that the caller should
3443 * use to rewrite src.
3444 */
3445 src_reg
3446 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3447 vec4_instruction *inst, src_reg src)
3448 {
3449 /* Resolve recursive reladdr scratch access by calling ourselves
3450 * with src.reladdr
3451 */
3452 if (src.reladdr)
3453 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3454 *src.reladdr);
3455
3456 /* Now handle scratch access on src */
3457 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3458 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3459 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3460 src.reg = temp.reg;
3461 src.reg_offset = temp.reg_offset;
3462 src.reladdr = NULL;
3463 }
3464
3465 return src;
3466 }
3467
3468 /**
3469 * We can't generally support array access in GRF space, because a
3470 * single instruction's destination can only span 2 contiguous
3471 * registers. So, we send all GRF arrays that get variable index
3472 * access to scratch space.
3473 */
3474 void
3475 vec4_visitor::move_grf_array_access_to_scratch()
3476 {
3477 int scratch_loc[this->alloc.count];
3478 memset(scratch_loc, -1, sizeof(scratch_loc));
3479
3480 /* First, calculate the set of virtual GRFs that need to be punted
3481 * to scratch due to having any array access on them, and where in
3482 * scratch.
3483 */
3484 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3485 if (inst->dst.file == GRF && inst->dst.reladdr) {
3486 if (scratch_loc[inst->dst.reg] == -1) {
3487 scratch_loc[inst->dst.reg] = c->last_scratch;
3488 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3489 }
3490
3491 for (src_reg *iter = inst->dst.reladdr;
3492 iter->reladdr;
3493 iter = iter->reladdr) {
3494 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3495 scratch_loc[iter->reg] = c->last_scratch;
3496 c->last_scratch += this->alloc.sizes[iter->reg];
3497 }
3498 }
3499 }
3500
3501 for (int i = 0; i < 3; i++) {
3502 for (src_reg *iter = &inst->src[i];
3503 iter->reladdr;
3504 iter = iter->reladdr) {
3505 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3506 scratch_loc[iter->reg] = c->last_scratch;
3507 c->last_scratch += this->alloc.sizes[iter->reg];
3508 }
3509 }
3510 }
3511 }
3512
3513 /* Now, for anything that will be accessed through scratch, rewrite
3514 * it to load/store. Note that this is a _safe list walk, because
3515 * we may generate a new scratch_write instruction after the one
3516 * we're processing.
3517 */
3518 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3519 /* Set up the annotation tracking for new generated instructions. */
3520 base_ir = inst->ir;
3521 current_annotation = inst->annotation;
3522
3523 /* First handle scratch access on the dst. Notice we have to handle
3524 * the case where the dst's reladdr also points to scratch space.
3525 */
3526 if (inst->dst.reladdr)
3527 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3528 *inst->dst.reladdr);
3529
3530 /* Now that we have handled any (possibly recursive) reladdr scratch
3531 * accesses for dst we can safely do the scratch write for dst itself
3532 */
3533 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3534 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3535
3536 /* Now handle scratch access on any src. In this case, since inst->src[i]
3537 * already is a src_reg, we can just call emit_resolve_reladdr with
3538 * inst->src[i] and it will take care of handling scratch loads for
3539 * both src and src.reladdr (recursively).
3540 */
3541 for (int i = 0; i < 3; i++) {
3542 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3543 inst->src[i]);
3544 }
3545 }
3546 }
3547
3548 /**
3549 * Emits an instruction before @inst to load the value named by @orig_src
3550 * from the pull constant buffer (surface) at @base_offset to @temp.
3551 */
3552 void
3553 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3554 dst_reg temp, src_reg orig_src,
3555 int base_offset)
3556 {
3557 int reg_offset = base_offset + orig_src.reg_offset;
3558 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3559 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3560 reg_offset);
3561
3562 emit_pull_constant_load_reg(temp,
3563 index,
3564 offset,
3565 block, inst);
3566 }
3567
3568 /**
3569 * Implements array access of uniforms by inserting a
3570 * PULL_CONSTANT_LOAD instruction.
3571 *
3572 * Unlike temporary GRF array access (where we don't support it due to
3573 * the difficulty of doing relative addressing on instruction
3574 * destinations), we could potentially do array access of uniforms
3575 * that were loaded in GRF space as push constants. In real-world
3576 * usage we've seen, though, the arrays being used are always larger
3577 * than we could load as push constants, so just always move all
3578 * uniform array access out to a pull constant buffer.
3579 */
3580 void
3581 vec4_visitor::move_uniform_array_access_to_pull_constants()
3582 {
3583 int pull_constant_loc[this->uniforms];
3584 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3585 bool nested_reladdr;
3586
3587 /* Walk through and find array access of uniforms. Put a copy of that
3588 * uniform in the pull constant buffer.
3589 *
3590 * Note that we don't move constant-indexed accesses to arrays. No
3591 * testing has been done of the performance impact of this choice.
3592 */
3593 do {
3594 nested_reladdr = false;
3595
3596 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3597 for (int i = 0; i < 3; i++) {
3598 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3599 continue;
3600
3601 int uniform = inst->src[i].reg;
3602
3603 if (inst->src[i].reladdr->reladdr)
3604 nested_reladdr = true; /* will need another pass */
3605
3606 /* If this array isn't already present in the pull constant buffer,
3607 * add it.
3608 */
3609 if (pull_constant_loc[uniform] == -1) {
3610 const gl_constant_value **values =
3611 &stage_prog_data->param[uniform * 4];
3612
3613 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3614
3615 assert(uniform < uniform_array_size);
3616 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3617 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3618 = values[j];
3619 }
3620 }
3621
3622 /* Set up the annotation tracking for new generated instructions. */
3623 base_ir = inst->ir;
3624 current_annotation = inst->annotation;
3625
3626 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3627
3628 emit_pull_constant_load(block, inst, temp, inst->src[i],
3629 pull_constant_loc[uniform]);
3630
3631 inst->src[i].file = temp.file;
3632 inst->src[i].reg = temp.reg;
3633 inst->src[i].reg_offset = temp.reg_offset;
3634 inst->src[i].reladdr = NULL;
3635 }
3636 }
3637 } while (nested_reladdr);
3638
3639 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3640 * no need to track them as larger-than-vec4 objects. This will be
3641 * relied on in cutting out unused uniform vectors from push
3642 * constants.
3643 */
3644 split_uniform_registers();
3645 }
3646
3647 void
3648 vec4_visitor::resolve_ud_negate(src_reg *reg)
3649 {
3650 if (reg->type != BRW_REGISTER_TYPE_UD ||
3651 !reg->negate)
3652 return;
3653
3654 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3655 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3656 *reg = temp;
3657 }
3658
3659 /**
3660 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3661 *
3662 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3663 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3664 */
3665 void
3666 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3667 {
3668 assert(devinfo->gen <= 5);
3669
3670 if (!rvalue->type->is_boolean())
3671 return;
3672
3673 src_reg and_result = src_reg(this, rvalue->type);
3674 src_reg neg_result = src_reg(this, rvalue->type);
3675 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3676 emit(MOV(dst_reg(neg_result), negate(and_result)));
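/* ANDing with 1 isolates the defined low bit; the negated MOV then turns a
 * 1 into -1 (all ones), producing the 0 / ~0 boolean the rest of the
 * backend expects.
 */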
3677 *reg = neg_result;
3678 }
3679
3680 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3681 void *log_data,
3682 struct brw_vec4_compile *c,
3683 struct gl_program *prog,
3684 const struct brw_vue_prog_key *key,
3685 struct brw_vue_prog_data *prog_data,
3686 struct gl_shader_program *shader_prog,
3687 gl_shader_stage stage,
3688 void *mem_ctx,
3689 bool no_spills,
3690 int shader_time_index)
3691 : backend_shader(compiler, log_data, mem_ctx,
3692 shader_prog, prog, &prog_data->base, stage),
3693 c(c),
3694 key(key),
3695 prog_data(prog_data),
3696 sanity_param_count(0),
3697 fail_msg(NULL),
3698 first_non_payload_grf(0),
3699 need_all_constants_in_pull_buffer(false),
3700 no_spills(no_spills),
3701 shader_time_index(shader_time_index)
3702 {
3703 this->failed = false;
3704
3705 this->base_ir = NULL;
3706 this->current_annotation = NULL;
3707 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3708
3709 this->variable_ht = hash_table_ctor(0,
3710 hash_table_pointer_hash,
3711 hash_table_pointer_compare);
3712
3713 this->virtual_grf_start = NULL;
3714 this->virtual_grf_end = NULL;
3715 this->live_intervals = NULL;
3716
3717 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3718
3719 this->uniforms = 0;
3720
3721 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3722 * at least one. See setup_uniforms() in brw_vec4.cpp.
3723 */
3724 this->uniform_array_size = 1;
3725 if (prog_data) {
3726 this->uniform_array_size =
3727 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3728 }
3729
3730 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3731 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3732 }
3733
3734 vec4_visitor::~vec4_visitor()
3735 {
3736 hash_table_dtor(this->variable_ht);
3737 }
3738
3739
3740 void
3741 vec4_visitor::fail(const char *format, ...)
3742 {
3743 va_list va;
3744 char *msg;
3745
3746 if (failed)
3747 return;
3748
3749 failed = true;
3750
3751 va_start(va, format);
3752 msg = ralloc_vasprintf(mem_ctx, format, va);
3753 va_end(va);
3754 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3755
3756 this->fail_msg = msg;
3757
3758 if (debug_enabled) {
3759 fprintf(stderr, "%s", msg);
3760 }
3761 }
3762
3763 } /* namespace brw */