i965: add support for ARB_shader_subroutine
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
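   /* Editor's sketch (pseudo-assembly, not from the upstream file): a MAD
    * that wants to read a vec4 uniform, e.g.
    *
    *    mad dst, u2, g5, g6
    *
    * is therefore emitted as a replicating MOV (VEC4_OPCODE_UNPACK_UNIFORM)
    * into a temporary GRF, followed by the three-source instruction reading
    * that temporary instead of the uniform.
    */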
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
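   /* Editor's illustration: on gen6 a MATH whose operand is, say, the
    * swizzled uniform u1.x is therefore emitted as
    *
    *    mov tmp, u1.x
    *    math dst, tmp
    *
    * whereas on gen7 only immediate operands need the extra MOV.
    */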
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
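   /* For reference, the GLSL packHalf2x16 semantics implemented below put
    * src0.x in the low word and src0.y in the high word of each channel:
    *
    *    dst = (f32to16(src0.y) << 16) | f32to16(src0.x)
    *
    * e.g. packing (1.0, -2.0) yields 0xc0003c00, since 1.0 is 0x3c00 and
    * -2.0 is 0xc000 in binary16. (Editor's worked example.)
    */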
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
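   /* Editor's worked example of the GLSL unpackHalf2x16 semantics
    * implemented below: x comes from the low word and y from the high word,
    *
    *    dst.x = f16to32(src0 & 0xffff)
    *    dst.y = f16to32(src0 >> 16)
    *
    * so unpacking 0xc0003c00 yields (1.0, -2.0).
    */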
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
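   /* Editor's note: the four bytes of the vector-float (VF) immediate below
    * decode, in the 8-bit restricted-float format, to (0.0, 8.0, 16.0, 24.0):
    *
    *    0x00 -> 0.0,  0x60 -> 8.0,  0x70 -> 16.0,  0x78 -> 24.0
    *
    * and the type-converting MOV into a uvec4 turns them into the integer
    * shift counts <0, 8, 16, 24>.
    */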
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
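   /* Editor's note: the rest of this function matches the GLSL definition
    * of unpackSnorm4x8: each byte is sign-extended (type B), scaled by
    * 1/127, and clamped to [-1, 1] by the min/max pair at the end, i.e.
    * clamp(b / 127.0, -1.0, +1.0).
    */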
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of the size of the vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
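         /* Editor's illustration of the resulting slot counts: a float, a
          * vec2 and a vec4 each take one vec4 slot; a mat3 takes three
          * (one per column); a float[4] array takes four.
          */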
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SUBROUTINE:
607 return 1;
608
609 case GLSL_TYPE_SAMPLER:
610 /* Samplers take up no register space, since they're baked in at
611 * link time.
612 */
613 return 0;
614 case GLSL_TYPE_ATOMIC_UINT:
615 return 0;
616 case GLSL_TYPE_IMAGE:
617 case GLSL_TYPE_VOID:
618 case GLSL_TYPE_DOUBLE:
619 case GLSL_TYPE_ERROR:
620 case GLSL_TYPE_INTERFACE:
621 unreachable("not reached");
622 }
623
624 return 0;
625 }
626
627 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
628 {
629 init();
630
631 this->file = GRF;
632 this->reg = v->alloc.allocate(type_size(type));
633
634 if (type->is_array() || type->is_record()) {
635 this->swizzle = BRW_SWIZZLE_NOOP;
636 } else {
637 this->swizzle = brw_swizzle_for_size(type->vector_elements);
638 }
639
640 this->type = brw_type_for_base_type(type);
641 }
642
643 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
644 {
645 assert(size > 0);
646
647 init();
648
649 this->file = GRF;
650 this->reg = v->alloc.allocate(type_size(type) * size);
651
652 this->swizzle = BRW_SWIZZLE_NOOP;
653
654 this->type = brw_type_for_base_type(type);
655 }
656
657 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
658 {
659 init();
660
661 this->file = GRF;
662 this->reg = v->alloc.allocate(type_size(type));
663
664 if (type->is_array() || type->is_record()) {
665 this->writemask = WRITEMASK_XYZW;
666 } else {
667 this->writemask = (1 << type->vector_elements) - 1;
668 }
669
670 this->type = brw_type_for_base_type(type);
671 }
672
673 /* Our support for uniforms is piggy-backed on the struct
674 * gl_program, because that's where the values actually
675 * get stored, rather than in some global gl_shader_program uniform
676 * store.
677 */
678 void
679 vec4_visitor::setup_uniform_values(ir_variable *ir)
680 {
681 int namelen = strlen(ir->name);
682
683 /* The data for our (non-builtin) uniforms is stored in a series of
684 * gl_uniform_driver_storage structs for each subcomponent that
685 * glGetUniformLocation() could name. We know it's been set up in the same
686 * order we'd walk the type, so walk the list of storage and find anything
687 * with our name, or the prefix of a component that starts with our name.
688 */
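   /* Editor's note: an array of matrices shows up as a single storage entry
    * whose array_elements and matrix_columns multiply to give the number of
    * vectors uploaded below, e.g. "uniform mat2 m[3]" yields 3 * 2 = 6 vec2s,
    * each padded out to a full vec4 slot with zeros.
    */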
689 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
690 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
691
692 if (storage->builtin)
693 continue;
694
695 if (strncmp(ir->name, storage->name, namelen) != 0 ||
696 (storage->name[namelen] != 0 &&
697 storage->name[namelen] != '.' &&
698 storage->name[namelen] != '[')) {
699 continue;
700 }
701
702 gl_constant_value *components = storage->storage;
703 unsigned vector_count = (MAX2(storage->array_elements, 1) *
704 storage->type->matrix_columns);
705
706 for (unsigned s = 0; s < vector_count; s++) {
707 assert(uniforms < uniform_array_size);
708 uniform_vector_size[uniforms] = storage->type->vector_elements;
709
710 int i;
711 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
712 stage_prog_data->param[uniforms * 4 + i] = components;
713 components++;
714 }
715 for (; i < 4; i++) {
716 static gl_constant_value zero = { 0.0 };
717 stage_prog_data->param[uniforms * 4 + i] = &zero;
718 }
719
720 uniforms++;
721 }
722 }
723 }
724
725 void
726 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
727 {
728 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
729 assert(this->uniforms < uniform_array_size);
730 this->uniform_vector_size[this->uniforms] = 4;
731 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
732 this->userplane[i].type = BRW_REGISTER_TYPE_F;
733 for (int j = 0; j < 4; ++j) {
734 stage_prog_data->param[this->uniforms * 4 + j] =
735 (gl_constant_value *) &clip_planes[i][j];
736 }
737 ++this->uniforms;
738 }
739 }
740
741 /* Our support for builtin uniforms is even scarier than non-builtin.
742 * It sits on top of the PROG_STATE_VAR parameters that are
743 * automatically updated from GL context state.
744 */
745 void
746 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
747 {
748 const ir_state_slot *const slots = ir->get_state_slots();
749 assert(slots != NULL);
750
751 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
752 /* This state reference has already been set up by ir_to_mesa,
753 * but we'll get the same index back here. We can reference
754 * ParameterValues directly, since unlike brw_fs.cpp, we never
755 * add new state references during compile.
756 */
757 int index = _mesa_add_state_reference(this->prog->Parameters,
758 (gl_state_index *)slots[i].tokens);
759 gl_constant_value *values =
760 &this->prog->Parameters->ParameterValues[index][0];
761
762 assert(this->uniforms < uniform_array_size);
763
764 for (unsigned j = 0; j < 4; j++)
765 stage_prog_data->param[this->uniforms * 4 + j] =
766 &values[GET_SWZ(slots[i].swizzle, j)];
767
768 this->uniform_vector_size[this->uniforms] =
769 (ir->type->is_scalar() || ir->type->is_vector() ||
770 ir->type->is_matrix() ? ir->type->vector_elements : 4);
771
772 this->uniforms++;
773 }
774 }
775
776 dst_reg *
777 vec4_visitor::variable_storage(ir_variable *var)
778 {
779 return (dst_reg *)hash_table_find(this->variable_ht, var);
780 }
781
782 void
783 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
784 enum brw_predicate *predicate)
785 {
786 ir_expression *expr = ir->as_expression();
787
788 *predicate = BRW_PREDICATE_NORMAL;
789
790 if (expr && expr->operation != ir_binop_ubo_load) {
791 src_reg op[3];
792 vec4_instruction *inst;
793
794 assert(expr->get_num_operands() <= 3);
795 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
796 expr->operands[i]->accept(this);
797 op[i] = this->result;
798
799 resolve_ud_negate(&op[i]);
800 }
801
802 switch (expr->operation) {
803 case ir_unop_logic_not:
804 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
805 inst->conditional_mod = BRW_CONDITIONAL_Z;
806 break;
807
808 case ir_binop_logic_xor:
809 if (devinfo->gen <= 5) {
810 src_reg temp = src_reg(this, ir->type);
811 emit(XOR(dst_reg(temp), op[0], op[1]));
812 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
813 } else {
814 inst = emit(XOR(dst_null_d(), op[0], op[1]));
815 }
816 inst->conditional_mod = BRW_CONDITIONAL_NZ;
817 break;
818
819 case ir_binop_logic_or:
820 if (devinfo->gen <= 5) {
821 src_reg temp = src_reg(this, ir->type);
822 emit(OR(dst_reg(temp), op[0], op[1]));
823 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
824 } else {
825 inst = emit(OR(dst_null_d(), op[0], op[1]));
826 }
827 inst->conditional_mod = BRW_CONDITIONAL_NZ;
828 break;
829
830 case ir_binop_logic_and:
831 if (devinfo->gen <= 5) {
832 src_reg temp = src_reg(this, ir->type);
833 emit(AND(dst_reg(temp), op[0], op[1]));
834 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
835 } else {
836 inst = emit(AND(dst_null_d(), op[0], op[1]));
837 }
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 break;
840
841 case ir_unop_f2b:
842 if (devinfo->gen >= 6) {
843 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
844 } else {
845 inst = emit(MOV(dst_null_f(), op[0]));
846 inst->conditional_mod = BRW_CONDITIONAL_NZ;
847 }
848 break;
849
850 case ir_unop_i2b:
851 if (devinfo->gen >= 6) {
852 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
853 } else {
854 inst = emit(MOV(dst_null_d(), op[0]));
855 inst->conditional_mod = BRW_CONDITIONAL_NZ;
856 }
857 break;
858
859 case ir_binop_all_equal:
860 if (devinfo->gen <= 5) {
861 resolve_bool_comparison(expr->operands[0], &op[0]);
862 resolve_bool_comparison(expr->operands[1], &op[1]);
863 }
864 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
865 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
866 break;
867
868 case ir_binop_any_nequal:
869 if (devinfo->gen <= 5) {
870 resolve_bool_comparison(expr->operands[0], &op[0]);
871 resolve_bool_comparison(expr->operands[1], &op[1]);
872 }
873 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
874 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
875 break;
876
877 case ir_unop_any:
878 if (devinfo->gen <= 5) {
879 resolve_bool_comparison(expr->operands[0], &op[0]);
880 }
881 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
882 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
883 break;
884
885 case ir_binop_greater:
886 case ir_binop_gequal:
887 case ir_binop_less:
888 case ir_binop_lequal:
889 case ir_binop_equal:
890 case ir_binop_nequal:
891 if (devinfo->gen <= 5) {
892 resolve_bool_comparison(expr->operands[0], &op[0]);
893 resolve_bool_comparison(expr->operands[1], &op[1]);
894 }
895 emit(CMP(dst_null_d(), op[0], op[1],
896 brw_conditional_for_comparison(expr->operation)));
897 break;
898
899 case ir_triop_csel: {
900 /* Expand the boolean condition into the flag register. */
901 inst = emit(MOV(dst_null_d(), op[0]));
902 inst->conditional_mod = BRW_CONDITIONAL_NZ;
903
904 /* Select which boolean to return. */
905 dst_reg temp(this, expr->operands[1]->type);
906 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
907 inst->predicate = BRW_PREDICATE_NORMAL;
908
909 /* Expand the result to a condition code. */
910 inst = emit(MOV(dst_null_d(), src_reg(temp)));
911 inst->conditional_mod = BRW_CONDITIONAL_NZ;
912 break;
913 }
914
915 default:
916 unreachable("not reached");
917 }
918 return;
919 }
920
921 ir->accept(this);
922
923 resolve_ud_negate(&this->result);
924
925 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
926 inst->conditional_mod = BRW_CONDITIONAL_NZ;
927 }
928
929 /**
930 * Emit a gen6 IF statement with the comparison folded into the IF
931 * instruction.
932 */
933 void
934 vec4_visitor::emit_if_gen6(ir_if *ir)
935 {
936 ir_expression *expr = ir->condition->as_expression();
937
938 if (expr && expr->operation != ir_binop_ubo_load) {
939 src_reg op[3];
940 dst_reg temp;
941
942 assert(expr->get_num_operands() <= 3);
943 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
944 expr->operands[i]->accept(this);
945 op[i] = this->result;
946 }
947
948 switch (expr->operation) {
949 case ir_unop_logic_not:
950 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
951 return;
952
953 case ir_binop_logic_xor:
954 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
955 return;
956
957 case ir_binop_logic_or:
958 temp = dst_reg(this, glsl_type::bool_type);
959 emit(OR(temp, op[0], op[1]));
960 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
961 return;
962
963 case ir_binop_logic_and:
964 temp = dst_reg(this, glsl_type::bool_type);
965 emit(AND(temp, op[0], op[1]));
966 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_f2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_unop_i2b:
974 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
975 return;
976
977 case ir_binop_greater:
978 case ir_binop_gequal:
979 case ir_binop_less:
980 case ir_binop_lequal:
981 case ir_binop_equal:
982 case ir_binop_nequal:
983 emit(IF(op[0], op[1],
984 brw_conditional_for_comparison(expr->operation)));
985 return;
986
987 case ir_binop_all_equal:
988 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
989 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
990 return;
991
992 case ir_binop_any_nequal:
993 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
994 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
995 return;
996
997 case ir_unop_any:
998 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
999 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1000 return;
1001
1002 case ir_triop_csel: {
1003 /* Expand the boolean condition into the flag register. */
1004 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1005 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1006
1007 /* Select which boolean to return. */
1008 dst_reg temp(this, expr->operands[1]->type);
1009 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1010 inst->predicate = BRW_PREDICATE_NORMAL;
1011
1012 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1013 return;
1014 }
1015
1016 default:
1017 unreachable("not reached");
1018 }
1019 return;
1020 }
1021
1022 ir->condition->accept(this);
1023
1024 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_variable *ir)
1029 {
1030 dst_reg *reg = NULL;
1031
1032 if (variable_storage(ir))
1033 return;
1034
1035 switch (ir->data.mode) {
1036 case ir_var_shader_in:
1037 assert(ir->data.location != -1);
1038 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1039 break;
1040
1041 case ir_var_shader_out:
1042 assert(ir->data.location != -1);
1043 reg = new(mem_ctx) dst_reg(this, ir->type);
1044
1045 for (int i = 0; i < type_size(ir->type); i++) {
1046 output_reg[ir->data.location + i] = *reg;
1047 output_reg[ir->data.location + i].reg_offset = i;
1048 output_reg[ir->data.location + i].type =
1049 brw_type_for_base_type(ir->type->get_scalar_type());
1050 output_reg_annotation[ir->data.location + i] = ir->name;
1051 }
1052 break;
1053
1054 case ir_var_auto:
1055 case ir_var_temporary:
1056 reg = new(mem_ctx) dst_reg(this, ir->type);
1057 break;
1058
1059 case ir_var_uniform:
1060 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1061
1062 /* Thanks to the lower_ubo_reference pass, we will see only
1063 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1064 * variables, so no need for them to be in variable_ht.
1065 *
1066 * Some uniforms, such as samplers and atomic counters, have no actual
1067 * storage, so we should ignore them.
1068 */
1069 if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
1070 return;
1071
1072 /* Track how big the whole uniform variable is, in case we need to put a
1073 * copy of its data into pull constants for array access.
1074 */
1075 assert(this->uniforms < uniform_array_size);
1076 this->uniform_size[this->uniforms] = type_size(ir->type);
1077
1078 if (!strncmp(ir->name, "gl_", 3)) {
1079 setup_builtin_uniform_values(ir);
1080 } else {
1081 setup_uniform_values(ir);
1082 }
1083 break;
1084
1085 case ir_var_system_value:
1086 reg = make_reg_for_system_value(ir);
1087 break;
1088
1089 default:
1090 unreachable("not reached");
1091 }
1092
1093 reg->type = brw_type_for_base_type(ir->type);
1094 hash_table_insert(this->variable_ht, reg, ir);
1095 }
1096
1097 void
1098 vec4_visitor::visit(ir_loop *ir)
1099 {
1100 /* We don't want debugging output to print the whole body of the
1101 * loop as the annotation.
1102 */
1103 this->base_ir = NULL;
1104
1105 emit(BRW_OPCODE_DO);
1106
1107 visit_instructions(&ir->body_instructions);
1108
1109 emit(BRW_OPCODE_WHILE);
1110 }
1111
1112 void
1113 vec4_visitor::visit(ir_loop_jump *ir)
1114 {
1115 switch (ir->mode) {
1116 case ir_loop_jump::jump_break:
1117 emit(BRW_OPCODE_BREAK);
1118 break;
1119 case ir_loop_jump::jump_continue:
1120 emit(BRW_OPCODE_CONTINUE);
1121 break;
1122 }
1123 }
1124
1125
1126 void
1127 vec4_visitor::visit(ir_function_signature *)
1128 {
1129 unreachable("not reached");
1130 }
1131
1132 void
1133 vec4_visitor::visit(ir_function *ir)
1134 {
1135 /* Ignore function bodies other than main() -- we shouldn't see calls to
1136 * them since they should all be inlined.
1137 */
1138 if (strcmp(ir->name, "main") == 0) {
1139 const ir_function_signature *sig;
1140 exec_list empty;
1141
1142 sig = ir->matching_signature(NULL, &empty, false);
1143
1144 assert(sig);
1145
1146 visit_instructions(&sig->body);
1147 }
1148 }
1149
1150 bool
1151 vec4_visitor::try_emit_mad(ir_expression *ir)
1152 {
1153 /* 3-src instructions were introduced in gen6. */
1154 if (devinfo->gen < 6)
1155 return false;
1156
1157 /* MAD can only handle floating-point data. */
1158 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1159 return false;
1160
1161 ir_rvalue *nonmul;
1162 ir_expression *mul;
1163 bool mul_negate, mul_abs;
1164
1165 for (int i = 0; i < 2; i++) {
1166 mul_negate = false;
1167 mul_abs = false;
1168
1169 mul = ir->operands[i]->as_expression();
1170 nonmul = ir->operands[1 - i];
1171
1172 if (mul && mul->operation == ir_unop_abs) {
1173 mul = mul->operands[0]->as_expression();
1174 mul_abs = true;
1175 } else if (mul && mul->operation == ir_unop_neg) {
1176 mul = mul->operands[0]->as_expression();
1177 mul_negate = true;
1178 }
1179
1180 if (mul && mul->operation == ir_binop_mul)
1181 break;
1182 }
1183
1184 if (!mul || mul->operation != ir_binop_mul)
1185 return false;
1186
1187 nonmul->accept(this);
1188 src_reg src0 = fix_3src_operand(this->result);
1189
1190 mul->operands[0]->accept(this);
1191 src_reg src1 = fix_3src_operand(this->result);
1192 src1.negate ^= mul_negate;
1193 src1.abs = mul_abs;
1194 if (mul_abs)
1195 src1.negate = false;
1196
1197 mul->operands[1]->accept(this);
1198 src_reg src2 = fix_3src_operand(this->result);
1199 src2.abs = mul_abs;
1200 if (mul_abs)
1201 src2.negate = false;
1202
1203 this->result = src_reg(this, ir->type);
1204 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1205
1206 return true;
1207 }
1208
1209 bool
1210 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1211 {
1212 /* This optimization relies on CMP setting the destination to 0 when
1213 * false. Early hardware only sets the least significant bit, and
1214 * leaves the other bits undefined. So we can't use it.
1215 */
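   /* Editor's sketch of what this emits for b2f(a < b):
    *
    *    cmp.l.f0  tmp, a, b          (tmp := ~0u where true, 0 elsewhere)
    *    (-f0.0)   sel tmp, tmp, 1.0F
    *
    * Channels where the comparison held take 1.0f from the SEL's second
    * source; the remaining channels keep the 0 the CMP already wrote.
    */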
1216 if (devinfo->gen < 6)
1217 return false;
1218
1219 ir_expression *const cmp = ir->operands[0]->as_expression();
1220
1221 if (cmp == NULL)
1222 return false;
1223
1224 switch (cmp->operation) {
1225 case ir_binop_less:
1226 case ir_binop_greater:
1227 case ir_binop_lequal:
1228 case ir_binop_gequal:
1229 case ir_binop_equal:
1230 case ir_binop_nequal:
1231 break;
1232
1233 default:
1234 return false;
1235 }
1236
1237 cmp->operands[0]->accept(this);
1238 const src_reg cmp_src0 = this->result;
1239
1240 cmp->operands[1]->accept(this);
1241 const src_reg cmp_src1 = this->result;
1242
1243 this->result = src_reg(this, ir->type);
1244
1245 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1246 brw_conditional_for_comparison(cmp->operation)));
1247
1248 /* If the comparison is false, this->result will just happen to be zero.
1249 */
1250 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1251 this->result, src_reg(1.0f));
1252 inst->predicate = BRW_PREDICATE_NORMAL;
1253 inst->predicate_inverse = true;
1254
1255 return true;
1256 }
1257
1258 void
1259 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1260 src_reg src0, src_reg src1)
1261 {
1262 vec4_instruction *inst;
1263
1264 if (devinfo->gen >= 6) {
1265 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1266 inst->conditional_mod = conditionalmod;
1267 } else {
1268 emit(CMP(dst, src0, src1, conditionalmod));
1269
1270 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1271 inst->predicate = BRW_PREDICATE_NORMAL;
1272 }
1273 }
1274
1275 void
1276 vec4_visitor::emit_lrp(const dst_reg &dst,
1277 const src_reg &x, const src_reg &y, const src_reg &a)
1278 {
1279 if (devinfo->gen >= 6) {
1280 /* Note that the instruction's argument order is reversed from GLSL
1281 * and the IR.
1282 */
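      /* i.e. LRP dst, a, y, x computes a * y + (1 - a) * x, which is exactly
       * GLSL's mix(x, y, a). (Editor's note.)
       */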
1283 emit(LRP(dst,
1284 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1285 } else {
1286 /* Earlier generations don't support three source operations, so we
1287 * need to emit x*(1-a) + y*a.
1288 */
1289 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1290 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1291 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1292 y_times_a.writemask = dst.writemask;
1293 one_minus_a.writemask = dst.writemask;
1294 x_times_one_minus_a.writemask = dst.writemask;
1295
1296 emit(MUL(y_times_a, y, a));
1297 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1298 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1299 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1300 }
1301 }
1302
1303 /**
1304 * Emits the instructions needed to perform a pull constant load. before_block
1305 * and before_inst can be NULL, in which case the instructions will be appended
1306 * to the end of the instruction list.
1307 */
1308 void
1309 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1310 src_reg surf_index,
1311 src_reg offset_reg,
1312 bblock_t *before_block,
1313 vec4_instruction *before_inst)
1314 {
1315 assert((before_inst == NULL && before_block == NULL) ||
1316 (before_inst && before_block));
1317
1318 vec4_instruction *pull;
1319
1320 if (devinfo->gen >= 9) {
1321 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1322 src_reg header(this, glsl_type::uvec4_type, 2);
1323
1324 pull = new(mem_ctx)
1325 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1326 dst_reg(header));
1327
1328 if (before_inst)
1329 emit_before(before_block, before_inst, pull);
1330 else
1331 emit(pull);
1332
1333 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1334 offset_reg.type);
1335 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1336
1337 if (before_inst)
1338 emit_before(before_block, before_inst, pull);
1339 else
1340 emit(pull);
1341
1342 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1343 dst,
1344 surf_index,
1345 header);
1346 pull->mlen = 2;
1347 pull->header_size = 1;
1348 } else if (devinfo->gen >= 7) {
1349 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1350
1351 grf_offset.type = offset_reg.type;
1352
1353 pull = MOV(grf_offset, offset_reg);
1354
1355 if (before_inst)
1356 emit_before(before_block, before_inst, pull);
1357 else
1358 emit(pull);
1359
1360 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1361 dst,
1362 surf_index,
1363 src_reg(grf_offset));
1364 pull->mlen = 1;
1365 } else {
1366 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1367 dst,
1368 surf_index,
1369 offset_reg);
1370 pull->base_mrf = 14;
1371 pull->mlen = 1;
1372 }
1373
1374 if (before_inst)
1375 emit_before(before_block, before_inst, pull);
1376 else
1377 emit(pull);
1378 }
1379
1380 src_reg
1381 vec4_visitor::emit_uniformize(const src_reg &src)
1382 {
1383 const src_reg chan_index(this, glsl_type::uint_type);
1384 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1385 src.type);
1386
1387 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1388 ->force_writemask_all = true;
1389 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1390 ->force_writemask_all = true;
1391
1392 return src_reg(dst);
1393 }
1394
1395 void
1396 vec4_visitor::visit(ir_expression *ir)
1397 {
1398 unsigned int operand;
1399 src_reg op[ARRAY_SIZE(ir->operands)];
1400 vec4_instruction *inst;
1401
1402 if (ir->operation == ir_binop_add) {
1403 if (try_emit_mad(ir))
1404 return;
1405 }
1406
1407 if (ir->operation == ir_unop_b2f) {
1408 if (try_emit_b2f_of_compare(ir))
1409 return;
1410 }
1411
1412 /* Storage for our result. Ideally for an assignment we'd be using
1413 * the actual storage for the result here, instead.
1414 */
1415 dst_reg result_dst(this, ir->type);
1416 src_reg result_src(result_dst);
1417
1418 if (ir->operation == ir_triop_csel) {
1419 ir->operands[1]->accept(this);
1420 op[1] = this->result;
1421 ir->operands[2]->accept(this);
1422 op[2] = this->result;
1423
1424 enum brw_predicate predicate;
1425 emit_bool_to_cond_code(ir->operands[0], &predicate);
1426 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1427 inst->predicate = predicate;
1428 this->result = result_src;
1429 return;
1430 }
1431
1432 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1433 this->result.file = BAD_FILE;
1434 ir->operands[operand]->accept(this);
1435 if (this->result.file == BAD_FILE) {
1436 fprintf(stderr, "Failed to get tree for expression operand:\n");
1437 ir->operands[operand]->fprint(stderr);
1438 exit(1);
1439 }
1440 op[operand] = this->result;
1441
1442 /* Matrix expression operands should have been broken down to vector
1443 * operations already.
1444 */
1445 assert(!ir->operands[operand]->type->is_matrix());
1446 }
1447
1448 /* If nothing special happens, this is the result. */
1449 this->result = result_src;
1450
1451 switch (ir->operation) {
1452 case ir_unop_logic_not:
1453 emit(NOT(result_dst, op[0]));
1454 break;
1455 case ir_unop_neg:
1456 op[0].negate = !op[0].negate;
1457 emit(MOV(result_dst, op[0]));
1458 break;
1459 case ir_unop_abs:
1460 op[0].abs = true;
1461 op[0].negate = false;
1462 emit(MOV(result_dst, op[0]));
1463 break;
1464
1465 case ir_unop_sign:
1466 if (ir->type->is_float()) {
1467 /* AND(val, 0x80000000) gives the sign bit.
1468 *
1469 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1470 * zero.
1471 */
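         /* Editor's worked example: sign(-3.5f). -3.5f is 0xc0600000; the
          * AND with 0x80000000 leaves 0x80000000, and the predicated OR with
          * 0x3f800000 (1.0f) produces 0xbf800000 == -1.0f. For an input of
          * 0.0 the predicate is false, so the result stays 0.0.
          */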
1472 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1473
1474 op[0].type = BRW_REGISTER_TYPE_UD;
1475 result_dst.type = BRW_REGISTER_TYPE_UD;
1476 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1477
1478 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1479 inst->predicate = BRW_PREDICATE_NORMAL;
1480
1481 this->result.type = BRW_REGISTER_TYPE_F;
1482 } else {
1483 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1484 * -> non-negative val generates 0x00000000.
1485 * Predicated OR sets 1 if val is positive.
1486 */
1487 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1488
1489 emit(ASR(result_dst, op[0], src_reg(31)));
1490
1491 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1492 inst->predicate = BRW_PREDICATE_NORMAL;
1493 }
1494 break;
1495
1496 case ir_unop_rcp:
1497 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1498 break;
1499
1500 case ir_unop_exp2:
1501 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1502 break;
1503 case ir_unop_log2:
1504 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1505 break;
1506 case ir_unop_exp:
1507 case ir_unop_log:
1508 unreachable("not reached: should be handled by ir_explog_to_explog2");
1509 case ir_unop_sin:
1510 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1511 break;
1512 case ir_unop_cos:
1513 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1514 break;
1515
1516 case ir_unop_dFdx:
1517 case ir_unop_dFdx_coarse:
1518 case ir_unop_dFdx_fine:
1519 case ir_unop_dFdy:
1520 case ir_unop_dFdy_coarse:
1521 case ir_unop_dFdy_fine:
1522 unreachable("derivatives not valid in vertex shader");
1523
1524 case ir_unop_bitfield_reverse:
1525 emit(BFREV(result_dst, op[0]));
1526 break;
1527 case ir_unop_bit_count:
1528 emit(CBIT(result_dst, op[0]));
1529 break;
1530 case ir_unop_find_msb: {
1531 src_reg temp = src_reg(this, glsl_type::uint_type);
1532
1533 inst = emit(FBH(dst_reg(temp), op[0]));
1534 inst->dst.writemask = WRITEMASK_XYZW;
1535
1536 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1537 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1538 * subtract the result from 31 to convert the MSB count into an LSB count.
1539 */
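      /* Editor's worked example: findMSB(0x00000100u). FBH returns 23 (the
       * set bit is 23 positions below bit 31), and 31 - 23 = 8 is the
       * LSB-relative answer GLSL expects.
       */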
1540
1541 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1542 temp.swizzle = BRW_SWIZZLE_NOOP;
1543 emit(MOV(result_dst, temp));
1544
1545 src_reg src_tmp = src_reg(result_dst);
1546 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1547
1548 src_tmp.negate = true;
1549 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1550 inst->predicate = BRW_PREDICATE_NORMAL;
1551 break;
1552 }
1553 case ir_unop_find_lsb:
1554 emit(FBL(result_dst, op[0]));
1555 break;
1556 case ir_unop_saturate:
1557 inst = emit(MOV(result_dst, op[0]));
1558 inst->saturate = true;
1559 break;
1560
1561 case ir_unop_noise:
1562 unreachable("not reached: should be handled by lower_noise");
1563
1564 case ir_unop_subroutine_to_int:
1565 emit(MOV(result_dst, op[0]));
1566 break;
1567
1568 case ir_binop_add:
1569 emit(ADD(result_dst, op[0], op[1]));
1570 break;
1571 case ir_binop_sub:
1572 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1573
1574 case ir_binop_mul:
1575 if (devinfo->gen < 8 && ir->type->is_integer()) {
1576 /* For integer multiplication, the MUL uses the low 16 bits of one of
1577 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1578 * adds in the contribution of the upper 16 bits of that
1579 * operand. If we can determine that one of the args is in the low
1580 * 16 bits, though, we can just emit a single MUL.
1581 */
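         /* Editor's sketch: the general 32x32 path below is roughly
          *
          *    mul  acc0, a, b     (low partial product into the accumulator)
          *    mach null, a, b     (folds in the upper 16-bit contribution)
          *    mov  dst,  acc0     (read back the low 32 bits)
          *
          * while a single MUL suffices when one operand is known to fit in
          * 16 bits, with the operand order chosen per generation.
          */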
1582 if (ir->operands[0]->is_uint16_constant()) {
1583 if (devinfo->gen < 7)
1584 emit(MUL(result_dst, op[0], op[1]));
1585 else
1586 emit(MUL(result_dst, op[1], op[0]));
1587 } else if (ir->operands[1]->is_uint16_constant()) {
1588 if (devinfo->gen < 7)
1589 emit(MUL(result_dst, op[1], op[0]));
1590 else
1591 emit(MUL(result_dst, op[0], op[1]));
1592 } else {
1593 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1594
1595 emit(MUL(acc, op[0], op[1]));
1596 emit(MACH(dst_null_d(), op[0], op[1]));
1597 emit(MOV(result_dst, src_reg(acc)));
1598 }
1599 } else {
1600 emit(MUL(result_dst, op[0], op[1]));
1601 }
1602 break;
1603 case ir_binop_imul_high: {
1604 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1605
1606 emit(MUL(acc, op[0], op[1]));
1607 emit(MACH(result_dst, op[0], op[1]));
1608 break;
1609 }
1610 case ir_binop_div:
1611 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1612 assert(ir->type->is_integer());
1613 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1614 break;
1615
1616 case ir_binop_carry:
1617 unreachable("Should have been lowered by carry_to_arith().");
1618
1619 case ir_binop_borrow:
1620 unreachable("Should have been lowered by borrow_to_arith().");
1621
1622 case ir_binop_mod:
1623 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1624 assert(ir->type->is_integer());
1625 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1626 break;
1627
1628 case ir_binop_less:
1629 case ir_binop_greater:
1630 case ir_binop_lequal:
1631 case ir_binop_gequal:
1632 case ir_binop_equal:
1633 case ir_binop_nequal: {
1634 if (devinfo->gen <= 5) {
1635 resolve_bool_comparison(ir->operands[0], &op[0]);
1636 resolve_bool_comparison(ir->operands[1], &op[1]);
1637 }
1638 emit(CMP(result_dst, op[0], op[1],
1639 brw_conditional_for_comparison(ir->operation)));
1640 break;
1641 }
1642
1643 case ir_binop_all_equal:
1644 if (devinfo->gen <= 5) {
1645 resolve_bool_comparison(ir->operands[0], &op[0]);
1646 resolve_bool_comparison(ir->operands[1], &op[1]);
1647 }
1648
1649 /* "==" operator producing a scalar boolean. */
1650 if (ir->operands[0]->type->is_vector() ||
1651 ir->operands[1]->type->is_vector()) {
1652 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1653 emit(MOV(result_dst, src_reg(0)));
1654 inst = emit(MOV(result_dst, src_reg(~0)));
1655 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1656 } else {
1657 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1658 }
1659 break;
1660 case ir_binop_any_nequal:
1661 if (devinfo->gen <= 5) {
1662 resolve_bool_comparison(ir->operands[0], &op[0]);
1663 resolve_bool_comparison(ir->operands[1], &op[1]);
1664 }
1665
1666 /* "!=" operator producing a scalar boolean. */
1667 if (ir->operands[0]->type->is_vector() ||
1668 ir->operands[1]->type->is_vector()) {
1669 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1670
1671 emit(MOV(result_dst, src_reg(0)));
1672 inst = emit(MOV(result_dst, src_reg(~0)));
1673 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1674 } else {
1675 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1676 }
1677 break;
1678
1679 case ir_unop_any:
1680 if (devinfo->gen <= 5) {
1681 resolve_bool_comparison(ir->operands[0], &op[0]);
1682 }
1683 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1684 emit(MOV(result_dst, src_reg(0)));
1685
1686 inst = emit(MOV(result_dst, src_reg(~0)));
1687 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1688 break;
1689
1690 case ir_binop_logic_xor:
1691 emit(XOR(result_dst, op[0], op[1]));
1692 break;
1693
1694 case ir_binop_logic_or:
1695 emit(OR(result_dst, op[0], op[1]));
1696 break;
1697
1698 case ir_binop_logic_and:
1699 emit(AND(result_dst, op[0], op[1]));
1700 break;
1701
1702 case ir_binop_dot:
1703 assert(ir->operands[0]->type->is_vector());
1704 assert(ir->operands[0]->type == ir->operands[1]->type);
1705 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1706 break;
1707
1708 case ir_unop_sqrt:
1709 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1710 break;
1711 case ir_unop_rsq:
1712 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1713 break;
1714
1715 case ir_unop_bitcast_i2f:
1716 case ir_unop_bitcast_u2f:
1717 this->result = op[0];
1718 this->result.type = BRW_REGISTER_TYPE_F;
1719 break;
1720
1721 case ir_unop_bitcast_f2i:
1722 this->result = op[0];
1723 this->result.type = BRW_REGISTER_TYPE_D;
1724 break;
1725
1726 case ir_unop_bitcast_f2u:
1727 this->result = op[0];
1728 this->result.type = BRW_REGISTER_TYPE_UD;
1729 break;
1730
1731 case ir_unop_i2f:
1732 case ir_unop_i2u:
1733 case ir_unop_u2i:
1734 case ir_unop_u2f:
1735 case ir_unop_f2i:
1736 case ir_unop_f2u:
1737 emit(MOV(result_dst, op[0]));
1738 break;
1739 case ir_unop_b2i:
1740 case ir_unop_b2f:
1741 if (devinfo->gen <= 5) {
1742 resolve_bool_comparison(ir->operands[0], &op[0]);
1743 }
1744 emit(MOV(result_dst, negate(op[0])));
1745 break;
1746 case ir_unop_f2b:
1747 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1748 break;
1749 case ir_unop_i2b:
1750 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1751 break;
1752
1753 case ir_unop_trunc:
1754 emit(RNDZ(result_dst, op[0]));
1755 break;
1756 case ir_unop_ceil: {
1757 src_reg tmp = src_reg(this, ir->type);
1758 op[0].negate = !op[0].negate;
1759 emit(RNDD(dst_reg(tmp), op[0]));
1760 tmp.negate = true;
1761 emit(MOV(result_dst, tmp));
1762 }
1763 break;
1764 case ir_unop_floor:
1765 inst = emit(RNDD(result_dst, op[0]));
1766 break;
1767 case ir_unop_fract:
1768 inst = emit(FRC(result_dst, op[0]));
1769 break;
1770 case ir_unop_round_even:
1771 emit(RNDE(result_dst, op[0]));
1772 break;
1773
1774 case ir_binop_min:
1775 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1776 break;
1777 case ir_binop_max:
1778 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1779 break;
1780
1781 case ir_binop_pow:
1782 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1783 break;
1784
1785 case ir_unop_bit_not:
1786 inst = emit(NOT(result_dst, op[0]));
1787 break;
1788 case ir_binop_bit_and:
1789 inst = emit(AND(result_dst, op[0], op[1]));
1790 break;
1791 case ir_binop_bit_xor:
1792 inst = emit(XOR(result_dst, op[0], op[1]));
1793 break;
1794 case ir_binop_bit_or:
1795 inst = emit(OR(result_dst, op[0], op[1]));
1796 break;
1797
1798 case ir_binop_lshift:
1799 inst = emit(SHL(result_dst, op[0], op[1]));
1800 break;
1801
1802 case ir_binop_rshift:
1803 if (ir->type->base_type == GLSL_TYPE_INT)
1804 inst = emit(ASR(result_dst, op[0], op[1]));
1805 else
1806 inst = emit(SHR(result_dst, op[0], op[1]));
1807 break;
1808
1809 case ir_binop_bfm:
1810 emit(BFI1(result_dst, op[0], op[1]));
1811 break;
1812
1813 case ir_binop_ubo_load: {
1814 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1815 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1816 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1817 src_reg offset;
1818
1819 /* Now, load the vector from that offset. */
1820 assert(ir->type->is_vector() || ir->type->is_scalar());
1821
1822 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1823 packed_consts.type = result.type;
1824 src_reg surf_index;
1825
1826 if (const_uniform_block) {
1827 /* The block index is a constant, so just emit the binding table entry
1828 * as an immediate.
1829 */
1830 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1831 const_uniform_block->value.u[0]);
1832 } else {
1833 /* The block index is not a constant. Evaluate the index expression
1834 * per-channel and add the base UBO index; we have to select a value
1835 * from any live channel.
1836 */
1837 surf_index = src_reg(this, glsl_type::uint_type);
1838 emit(ADD(dst_reg(surf_index), op[0],
1839 src_reg(prog_data->base.binding_table.ubo_start)));
1840 surf_index = emit_uniformize(surf_index);
1841
1842 /* Assume this may touch any UBO. It would be nice to provide
1843 * a tighter bound, but the array information is already lowered away.
1844 */
1845 brw_mark_surface_used(&prog_data->base,
1846 prog_data->base.binding_table.ubo_start +
1847 shader_prog->NumUniformBlocks - 1);
1848 }
1849
1850 if (const_offset_ir) {
1851 if (devinfo->gen >= 8) {
1852 /* Store the offset in a GRF so we can send-from-GRF. */
1853 offset = src_reg(this, glsl_type::int_type);
1854 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1855 } else {
1856 /* Immediates are fine on older generations since they'll be moved
1857 * to a (potentially fake) MRF at the generator level.
1858 */
1859 offset = src_reg(const_offset / 16);
1860 }
1861 } else {
1862 offset = src_reg(this, glsl_type::uint_type);
1863 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1864 }
1865
1866 emit_pull_constant_load_reg(dst_reg(packed_consts),
1867 surf_index,
1868 offset,
1869 NULL, NULL /* before_block/inst */);
1870
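      /* Editor's note on the swizzle math below: the pull load returns a
       * whole 16-byte block, so the byte offset within that block selects
       * the starting component; e.g. const_offset == 20 reads the second
       * vec4 of the UBO and 20 % 16 / 4 == 1 starts the swizzle at y.
       */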
1871 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1872 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1873 const_offset % 16 / 4,
1874 const_offset % 16 / 4,
1875 const_offset % 16 / 4);
1876
1877 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1878 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1879 emit(CMP(result_dst, packed_consts, src_reg(0u),
1880 BRW_CONDITIONAL_NZ));
1881 } else {
1882 emit(MOV(result_dst, packed_consts));
1883 }
1884 break;
1885 }
1886
1887 case ir_binop_vector_extract:
1888 unreachable("should have been lowered by vec_index_to_cond_assign");
1889
1890 case ir_triop_fma:
1891 op[0] = fix_3src_operand(op[0]);
1892 op[1] = fix_3src_operand(op[1]);
1893 op[2] = fix_3src_operand(op[2]);
1894 /* Note that the instruction's argument order is reversed from GLSL
1895 * and the IR.
1896 */
1897 emit(MAD(result_dst, op[2], op[1], op[0]));
1898 break;
1899
1900 case ir_triop_lrp:
1901 emit_lrp(result_dst, op[0], op[1], op[2]);
1902 break;
1903
1904 case ir_triop_csel:
1905 unreachable("already handled above");
1906 break;
1907
1908 case ir_triop_bfi:
1909 op[0] = fix_3src_operand(op[0]);
1910 op[1] = fix_3src_operand(op[1]);
1911 op[2] = fix_3src_operand(op[2]);
1912 emit(BFI2(result_dst, op[0], op[1], op[2]));
1913 break;
1914
1915 case ir_triop_bitfield_extract:
1916 op[0] = fix_3src_operand(op[0]);
1917 op[1] = fix_3src_operand(op[1]);
1918 op[2] = fix_3src_operand(op[2]);
1919 /* Note that the instruction's argument order is reversed from GLSL
1920 * and the IR.
1921 */
1922 emit(BFE(result_dst, op[2], op[1], op[0]));
1923 break;
1924
1925 case ir_triop_vector_insert:
1926 unreachable("should have been lowered by lower_vector_insert");
1927
1928 case ir_quadop_bitfield_insert:
1929 unreachable("not reached: should be handled by "
1930 "bitfield_insert_to_bfm_bfi\n");
1931
1932 case ir_quadop_vector:
1933 unreachable("not reached: should be handled by lower_quadop_vector");
1934
1935 case ir_unop_pack_half_2x16:
1936 emit_pack_half_2x16(result_dst, op[0]);
1937 break;
1938 case ir_unop_unpack_half_2x16:
1939 emit_unpack_half_2x16(result_dst, op[0]);
1940 break;
1941 case ir_unop_unpack_unorm_4x8:
1942 emit_unpack_unorm_4x8(result_dst, op[0]);
1943 break;
1944 case ir_unop_unpack_snorm_4x8:
1945 emit_unpack_snorm_4x8(result_dst, op[0]);
1946 break;
1947 case ir_unop_pack_unorm_4x8:
1948 emit_pack_unorm_4x8(result_dst, op[0]);
1949 break;
1950 case ir_unop_pack_snorm_4x8:
1951 emit_pack_snorm_4x8(result_dst, op[0]);
1952 break;
1953 case ir_unop_pack_snorm_2x16:
1954 case ir_unop_pack_unorm_2x16:
1955 case ir_unop_unpack_snorm_2x16:
1956 case ir_unop_unpack_unorm_2x16:
1957 unreachable("not reached: should be handled by lower_packing_builtins");
1958 case ir_unop_unpack_half_2x16_split_x:
1959 case ir_unop_unpack_half_2x16_split_y:
1960 case ir_binop_pack_half_2x16_split:
1961 case ir_unop_interpolate_at_centroid:
1962 case ir_binop_interpolate_at_sample:
1963 case ir_binop_interpolate_at_offset:
1964 unreachable("not reached: should not occur in vertex shader");
1965 case ir_binop_ldexp:
1966 unreachable("not reached: should be handled by ldexp_to_arith()");
1967 case ir_unop_d2f:
1968 case ir_unop_f2d:
1969 case ir_unop_d2i:
1970 case ir_unop_i2d:
1971 case ir_unop_d2u:
1972 case ir_unop_u2d:
1973 case ir_unop_d2b:
1974 case ir_unop_pack_double_2x32:
1975 case ir_unop_unpack_double_2x32:
1976 case ir_unop_frexp_sig:
1977 case ir_unop_frexp_exp:
1978 unreachable("fp64 todo");
1979 }
1980 }
1981
1982
1983 void
1984 vec4_visitor::visit(ir_swizzle *ir)
1985 {
1986 /* Note that this is only swizzles in expressions, not those on the left
1987 * hand side of an assignment, which do write masking. See ir_assignment
1988 * for that.
1989 */
1990 const unsigned swz = brw_compose_swizzle(
1991 brw_swizzle_for_size(ir->type->vector_elements),
1992 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1993
1994 ir->val->accept(this);
1995 this->result = swizzle(this->result, swz);
1996 }
1997
1998 void
1999 vec4_visitor::visit(ir_dereference_variable *ir)
2000 {
2001 const struct glsl_type *type = ir->type;
2002 dst_reg *reg = variable_storage(ir->var);
2003
2004 if (!reg) {
2005 fail("Failed to find variable storage for %s\n", ir->var->name);
2006 this->result = src_reg(brw_null_reg());
2007 return;
2008 }
2009
2010 this->result = src_reg(*reg);
2011
2012 /* System values get their swizzle from the dst_reg writemask */
2013 if (ir->var->data.mode == ir_var_system_value)
2014 return;
2015
2016 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2017 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2018 }
2019
2020
2021 int
2022 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2023 {
2024 /* Under normal circumstances array elements are stored consecutively, so
2025 * the stride is equal to the size of the array element.
2026 */
2027 return type_size(ir->type);
2028 }
2029
2030
2031 void
2032 vec4_visitor::visit(ir_dereference_array *ir)
2033 {
2034 ir_constant *constant_index;
2035 src_reg src;
2036 int array_stride = compute_array_stride(ir);
2037
2038 constant_index = ir->array_index->constant_expression_value();
2039
2040 ir->array->accept(this);
2041 src = this->result;
2042
2043 if (constant_index) {
2044 src.reg_offset += constant_index->value.i[0] * array_stride;
2045 } else {
2046 /* Variable index array dereference: take the register for the base
2047 * of the array and attach an index that offsets the Mesa register
2048 * index at run time.
2049 */
2050 ir->array_index->accept(this);
2051
2052 src_reg index_reg;
2053
2054 if (array_stride == 1) {
2055 index_reg = this->result;
2056 } else {
2057 index_reg = src_reg(this, glsl_type::int_type);
2058
2059 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2060 }
2061
2062 if (src.reladdr) {
2063 src_reg temp = src_reg(this, glsl_type::int_type);
2064
2065 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2066
2067 index_reg = temp;
2068 }
2069
2070 src.reladdr = ralloc(mem_ctx, src_reg);
2071 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2072 }
2073
2074 /* If the type is smaller than a vec4, replicate the last channel out. */
2075 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2076 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2077 else
2078 src.swizzle = BRW_SWIZZLE_NOOP;
2079 src.type = brw_type_for_base_type(ir->type);
2080
2081 this->result = src;
2082 }
2083
2084 void
2085 vec4_visitor::visit(ir_dereference_record *ir)
2086 {
2087 unsigned int i;
2088 const glsl_type *struct_type = ir->record->type;
2089 int offset = 0;
2090
2091 ir->record->accept(this);
2092
2093 for (i = 0; i < struct_type->length; i++) {
2094 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2095 break;
2096 offset += type_size(struct_type->fields.structure[i].type);
2097 }
2098
2099 /* If the type is smaller than a vec4, replicate the last channel out. */
2100 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2101 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2102 else
2103 this->result.swizzle = BRW_SWIZZLE_NOOP;
2104 this->result.type = brw_type_for_base_type(ir->type);
2105
2106 this->result.reg_offset += offset;
2107 }
2108
2109 /**
2110 * We want to be careful in assignment setup to hit the actual storage
2111 * instead of potentially using a temporary like we might with the
2112 * ir_dereference handler.
2113 */
2114 static dst_reg
2115 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2116 {
2117 /* The LHS must be a dereference. If the LHS is a variable indexed array
2118 * access of a vector, it must be separated into a series of conditional moves
2119 * before reaching this point (see ir_vec_index_to_cond_assign).
2120 */
2121 assert(ir->as_dereference());
2122 ir_dereference_array *deref_array = ir->as_dereference_array();
2123 if (deref_array) {
2124 assert(!deref_array->array->type->is_vector());
2125 }
2126
2127 /* Use the rvalue deref handler for the most part. We'll ignore
2128 * swizzles in it and write swizzles using writemask, though.
2129 */
2130 ir->accept(v);
2131 return dst_reg(v->result);
2132 }
2133
2134 void
2135 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2136 const struct glsl_type *type,
2137 enum brw_predicate predicate)
2138 {
2139 if (type->base_type == GLSL_TYPE_STRUCT) {
2140 for (unsigned int i = 0; i < type->length; i++) {
2141 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2142 }
2143 return;
2144 }
2145
2146 if (type->is_array()) {
2147 for (unsigned int i = 0; i < type->length; i++) {
2148 emit_block_move(dst, src, type->fields.array, predicate);
2149 }
2150 return;
2151 }
2152
2153 if (type->is_matrix()) {
2154 const struct glsl_type *vec_type;
2155
2156 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2157 type->vector_elements, 1);
2158
2159 for (int i = 0; i < type->matrix_columns; i++) {
2160 emit_block_move(dst, src, vec_type, predicate);
2161 }
2162 return;
2163 }
2164
2165 assert(type->is_scalar() || type->is_vector());
2166
2167 dst->type = brw_type_for_base_type(type);
2168 src->type = dst->type;
2169
2170 dst->writemask = (1 << type->vector_elements) - 1;
2171
2172 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2173
2174 vec4_instruction *inst = emit(MOV(*dst, *src));
2175 inst->predicate = predicate;
2176
2177 dst->reg_offset++;
2178 src->reg_offset++;
2179 }
2180
2181
2182 /* If the RHS processing resulted in an instruction generating a
2183 * temporary value, and it would be easy to rewrite the instruction to
2184 * generate its result right into the LHS instead, do so. This ends
2185 * up reliably removing instructions where it can be tricky to do so
2186 * later without real UD chain information.
2187 */
2188 bool
2189 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2190 dst_reg dst,
2191 src_reg src,
2192 vec4_instruction *pre_rhs_inst,
2193 vec4_instruction *last_rhs_inst)
2194 {
2195 /* This could be supported, but it would take more smarts. */
2196 if (ir->condition)
2197 return false;
2198
2199 if (pre_rhs_inst == last_rhs_inst)
2200 return false; /* No instructions generated to work with. */
2201
2202 /* Make sure the last instruction generated our source reg. */
2203 if (src.file != GRF ||
2204 src.file != last_rhs_inst->dst.file ||
2205 src.reg != last_rhs_inst->dst.reg ||
2206 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2207 src.reladdr ||
2208 src.abs ||
2209 src.negate ||
2210 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2211 return false;
2212
2213 /* Check that the last instruction fully initialized the channels
2214 * we want to use, in the order we want to use them. We could
2215 * potentially reswizzle the operands of many instructions so that
2216 * we could handle out of order channels, but don't yet.
2217 */
2218
2219 for (unsigned i = 0; i < 4; i++) {
2220 if (dst.writemask & (1 << i)) {
2221 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2222 return false;
2223
2224 if (BRW_GET_SWZ(src.swizzle, i) != i)
2225 return false;
2226 }
2227 }
2228
2229 /* Success! Rewrite the instruction. */
2230 last_rhs_inst->dst.file = dst.file;
2231 last_rhs_inst->dst.reg = dst.reg;
2232 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2233 last_rhs_inst->dst.reladdr = dst.reladdr;
2234 last_rhs_inst->dst.writemask &= dst.writemask;
2235
2236 return true;
2237 }
2238
2239 void
2240 vec4_visitor::visit(ir_assignment *ir)
2241 {
2242 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2243 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2244
2245 if (!ir->lhs->type->is_scalar() &&
2246 !ir->lhs->type->is_vector()) {
2247 ir->rhs->accept(this);
2248 src_reg src = this->result;
2249
2250 if (ir->condition) {
2251 emit_bool_to_cond_code(ir->condition, &predicate);
2252 }
2253
2254 /* emit_block_move doesn't account for swizzles in the source register.
2255 * This should be ok, since the source register is a structure or an
2256 * array, and those can't be swizzled. But double-check to be sure.
2257 */
2258 assert(src.swizzle ==
2259 (ir->rhs->type->is_matrix()
2260 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2261 : BRW_SWIZZLE_NOOP));
2262
2263 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2264 return;
2265 }
2266
2267 /* Now we're down to just a scalar/vector with writemasks. */
2268 int i;
2269
2270 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2271 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2272
2273 ir->rhs->accept(this);
2274
2275 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2276
2277 int swizzles[4];
2278 int src_chan = 0;
2279
2280 assert(ir->lhs->type->is_vector() ||
2281 ir->lhs->type->is_scalar());
2282 dst.writemask = ir->write_mask;
2283
2284 /* Swizzle a small RHS vector into the channels being written.
2285 *
2286 * glsl ir treats write_mask as dictating how many channels are
2287 * present on the RHS while in our instructions we need to make
2288 * those channels appear in the slots of the vec4 they're written to.
2289 */
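/* For example, a .yz write mask means the RHS supplies two channels in
 * .xy; the swizzle built below becomes XXYX, so src.x lands in dst.y and
 * src.y in dst.z.
 */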
2290 for (int i = 0; i < 4; i++)
2291 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2292
2293 src_reg src = swizzle(this->result,
2294 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2295 swizzles[2], swizzles[3]));
2296
2297 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2298 return;
2299 }
2300
2301 if (ir->condition) {
2302 emit_bool_to_cond_code(ir->condition, &predicate);
2303 }
2304
2305 for (i = 0; i < type_size(ir->lhs->type); i++) {
2306 vec4_instruction *inst = emit(MOV(dst, src));
2307 inst->predicate = predicate;
2308
2309 dst.reg_offset++;
2310 src.reg_offset++;
2311 }
2312 }
2313
2314 void
2315 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2316 {
2317 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2318 foreach_in_list(ir_constant, field_value, &ir->components) {
2319 emit_constant_values(dst, field_value);
2320 }
2321 return;
2322 }
2323
2324 if (ir->type->is_array()) {
2325 for (unsigned int i = 0; i < ir->type->length; i++) {
2326 emit_constant_values(dst, ir->array_elements[i]);
2327 }
2328 return;
2329 }
2330
2331 if (ir->type->is_matrix()) {
2332 for (int i = 0; i < ir->type->matrix_columns; i++) {
2333 float *vec = &ir->value.f[i * ir->type->vector_elements];
2334
2335 for (int j = 0; j < ir->type->vector_elements; j++) {
2336 dst->writemask = 1 << j;
2337 dst->type = BRW_REGISTER_TYPE_F;
2338
2339 emit(MOV(*dst, src_reg(vec[j])));
2340 }
2341 dst->reg_offset++;
2342 }
2343 return;
2344 }
2345
2346 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2347
2348 for (int i = 0; i < ir->type->vector_elements; i++) {
2349 if (!(remaining_writemask & (1 << i)))
2350 continue;
2351
2352 dst->writemask = 1 << i;
2353 dst->type = brw_type_for_base_type(ir->type);
2354
2355 /* Find other components that match the one we're about to
2356 * write. Emits fewer instructions for things like vec4(0.5,
2357 * 1.5, 1.5, 1.5).
2358 */
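/* For that example the first pass writes 0.5 to .x, and the second pass
 * writes 1.5 to .yzw with a single MOV.
 */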
2359 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2360 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2361 if (ir->value.b[i] == ir->value.b[j])
2362 dst->writemask |= (1 << j);
2363 } else {
2364 /* u, i, and f storage all line up, so no need for a
2365 * switch case for comparing each type.
2366 */
2367 if (ir->value.u[i] == ir->value.u[j])
2368 dst->writemask |= (1 << j);
2369 }
2370 }
2371
2372 switch (ir->type->base_type) {
2373 case GLSL_TYPE_FLOAT:
2374 emit(MOV(*dst, src_reg(ir->value.f[i])));
2375 break;
2376 case GLSL_TYPE_INT:
2377 emit(MOV(*dst, src_reg(ir->value.i[i])));
2378 break;
2379 case GLSL_TYPE_UINT:
2380 emit(MOV(*dst, src_reg(ir->value.u[i])));
2381 break;
2382 case GLSL_TYPE_BOOL:
2383 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2384 break;
2385 default:
2386 unreachable("Non-float/uint/int/bool constant");
2387 }
2388
2389 remaining_writemask &= ~dst->writemask;
2390 }
2391 dst->reg_offset++;
2392 }
2393
2394 void
2395 vec4_visitor::visit(ir_constant *ir)
2396 {
2397 dst_reg dst = dst_reg(this, ir->type);
2398 this->result = src_reg(dst);
2399
2400 emit_constant_values(&dst, ir);
2401 }
2402
2403 void
2404 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2405 {
2406 ir_dereference *deref = static_cast<ir_dereference *>(
2407 ir->actual_parameters.get_head());
2408 ir_variable *location = deref->variable_referenced();
2409 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2410 location->data.binding);
2411
2412 /* Calculate the surface offset */
2413 src_reg offset(this, glsl_type::uint_type);
2414 ir_dereference_array *deref_array = deref->as_dereference_array();
2415 if (deref_array) {
2416 deref_array->array_index->accept(this);
2417
2418 src_reg tmp(this, glsl_type::uint_type);
2419 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2420 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2421 } else {
2422 offset = location->data.atomic.offset;
2423 }
2424
2425 /* Emit the appropriate machine instruction */
2426 const char *callee = ir->callee->function_name();
2427 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2428
2429 if (!strcmp("__intrinsic_atomic_read", callee)) {
2430 emit_untyped_surface_read(surf_index, dst, offset);
2431
2432 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2433 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2434 src_reg(), src_reg());
2435
2436 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2437 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2438 src_reg(), src_reg());
2439 }
2440 }
2441
2442 void
2443 vec4_visitor::visit(ir_call *ir)
2444 {
2445 const char *callee = ir->callee->function_name();
2446
2447 if (!strcmp("__intrinsic_atomic_read", callee) ||
2448 !strcmp("__intrinsic_atomic_increment", callee) ||
2449 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2450 visit_atomic_counter_intrinsic(ir);
2451 } else {
2452 unreachable("Unsupported intrinsic.");
2453 }
2454 }
2455
2456 src_reg
2457 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2458 {
2459 vec4_instruction *inst =
2460 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2461 dst_reg(this, glsl_type::uvec4_type));
2462 inst->base_mrf = 2;
2463 inst->src[1] = sampler;
2464
2465 int param_base;
2466
2467 if (devinfo->gen >= 9) {
2468 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2469 vec4_instruction *header_inst = new(mem_ctx)
2470 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2471 dst_reg(MRF, inst->base_mrf));
2472
2473 emit(header_inst);
2474
2475 inst->mlen = 2;
2476 inst->header_size = 1;
2477 param_base = inst->base_mrf + 1;
2478 } else {
2479 inst->mlen = 1;
2480 param_base = inst->base_mrf;
2481 }
2482
2483 /* parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
2484 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2485 int zero_mask = 0xf & ~coord_mask;
2486
2487 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2488 coordinate));
2489
2490 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2491 src_reg(0)));
2492
2493 emit(inst);
2494 return src_reg(inst->dst);
2495 }
2496
2497 static bool
2498 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2499 {
2500 if (devinfo->gen < 8 && !devinfo->is_haswell)
2501 return false;
2502
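/* A sampler index above 15 doesn't fit in the 4-bit sampler field of the
 * message descriptor, and a non-immediate index might end up that large,
 * so both presumably need to be supplied through a message header.
 */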
2503 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2504 }
2505
2506 void
2507 vec4_visitor::visit(ir_texture *ir)
2508 {
2509 uint32_t sampler =
2510 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2511
2512 ir_rvalue *nonconst_sampler_index =
2513 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2514
2515 /* Handle non-constant sampler array indexing */
2516 src_reg sampler_reg;
2517 if (nonconst_sampler_index) {
2518 /* The highest sampler which may be used by this operation is
2519 * the last element of the array. Mark it here, because the generator
2520 * doesn't have enough information to determine the bound.
2521 */
2522 uint32_t array_size = ir->sampler->as_dereference_array()
2523 ->array->type->array_size();
2524
2525 uint32_t max_used = sampler + array_size - 1;
2526 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2527 max_used += prog_data->base.binding_table.gather_texture_start;
2528 } else {
2529 max_used += prog_data->base.binding_table.texture_start;
2530 }
2531
2532 brw_mark_surface_used(&prog_data->base, max_used);
2533
2534 /* Emit code to evaluate the actual indexing expression */
2535 nonconst_sampler_index->accept(this);
2536 src_reg temp(this, glsl_type::uint_type);
2537 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2538 sampler_reg = emit_uniformize(temp);
2539 } else {
2540 /* Single sampler, or constant array index; the indexing expression
2541 * is just an immediate.
2542 */
2543 sampler_reg = src_reg(sampler);
2544 }
2545
2546 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2547 * emitting anything other than setting up the constant result.
2548 */
2549 if (ir->op == ir_tg4) {
2550 ir_constant *chan = ir->lod_info.component->as_constant();
2551 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2552 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2553 dst_reg result(this, ir->type);
2554 this->result = src_reg(result);
2555 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2556 return;
2557 }
2558 }
2559
2560 /* Should be lowered by do_lower_texture_projection */
2561 assert(!ir->projector);
2562
2563 /* Should be lowered */
2564 assert(!ir->offset || !ir->offset->type->is_array());
2565
2566 /* Generate code to compute all the subexpression trees. This has to be
2567 * done before loading any values into MRFs for the sampler message since
2568 * generating these values may involve SEND messages that need the MRFs.
2569 */
2570 src_reg coordinate;
2571 if (ir->coordinate) {
2572 ir->coordinate->accept(this);
2573 coordinate = this->result;
2574 }
2575
2576 src_reg shadow_comparitor;
2577 if (ir->shadow_comparitor) {
2578 ir->shadow_comparitor->accept(this);
2579 shadow_comparitor = this->result;
2580 }
2581
2582 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2583 src_reg offset_value;
2584 if (has_nonconstant_offset) {
2585 ir->offset->accept(this);
2586 offset_value = src_reg(this->result);
2587 }
2588
2589 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2590 src_reg lod, dPdx, dPdy, sample_index, mcs;
2591 switch (ir->op) {
2592 case ir_tex:
2593 lod = src_reg(0.0f);
2594 lod_type = glsl_type::float_type;
2595 break;
2596 case ir_txf:
2597 case ir_txl:
2598 case ir_txs:
2599 ir->lod_info.lod->accept(this);
2600 lod = this->result;
2601 lod_type = ir->lod_info.lod->type;
2602 break;
2603 case ir_query_levels:
2604 lod = src_reg(0);
2605 lod_type = glsl_type::int_type;
2606 break;
2607 case ir_txf_ms:
2608 ir->lod_info.sample_index->accept(this);
2609 sample_index = this->result;
2610 sample_index_type = ir->lod_info.sample_index->type;
2611
2612 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2613 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2614 else
2615 mcs = src_reg(0u);
2616 break;
2617 case ir_txd:
2618 ir->lod_info.grad.dPdx->accept(this);
2619 dPdx = this->result;
2620
2621 ir->lod_info.grad.dPdy->accept(this);
2622 dPdy = this->result;
2623
2624 lod_type = ir->lod_info.grad.dPdx->type;
2625 break;
2626 case ir_txb:
2627 case ir_lod:
2628 case ir_tg4:
2629 break;
2630 }
2631
2632 enum opcode opcode;
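/* Implicit-derivative sampling isn't available outside the fragment
 * shader, so plain texture() (ir_tex) is sent as TXL using the zero LOD
 * set up above.
 */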
2633 switch (ir->op) {
2634 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2635 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2636 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2637 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2638 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2639 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2640 case ir_tg4: opcode = has_nonconstant_offset
2641 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2642 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2643 case ir_txb:
2644 unreachable("TXB is not valid for vertex shaders.");
2645 case ir_lod:
2646 unreachable("LOD is not valid for vertex shaders.");
2647 default:
2648 unreachable("Unrecognized tex op");
2649 }
2650
2651 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2652 opcode, dst_reg(this, ir->type));
2653
2654 if (ir->offset != NULL && !has_nonconstant_offset) {
2655 inst->offset =
2656 brw_texture_offset(ir->offset->as_constant()->value.i,
2657 ir->offset->type->vector_elements);
2658 }
2659
2660 /* Stuff the channel select bits in the top of the texture offset */
2661 if (ir->op == ir_tg4)
2662 inst->offset |= gather_channel(ir, sampler) << 16;
2663
2664 /* The message header is necessary for:
2665 * - Gen4 (always)
2666 * - Gen9+ for selecting SIMD4x2
2667 * - Texel offsets
2668 * - Gather channel selection
2669 * - Sampler indices too large to fit in a 4-bit value.
2670 */
2671 inst->header_size =
2672 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2673 inst->offset != 0 || ir->op == ir_tg4 ||
2674 is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2675 inst->base_mrf = 2;
2676 inst->mlen = inst->header_size + 1; /* always at least one */
2677 inst->dst.writemask = WRITEMASK_XYZW;
2678 inst->shadow_compare = ir->shadow_comparitor != NULL;
2679
2680 inst->src[1] = sampler_reg;
2681
2682 /* MRF for the first parameter */
2683 int param_base = inst->base_mrf + inst->header_size;
2684
2685 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2686 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2687 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2688 } else {
2689 /* Load the coordinate */
2690 /* FINISHME: gl_clamp_mask and saturate */
2691 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2692 int zero_mask = 0xf & ~coord_mask;
2693
2694 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2695 coordinate));
2696
2697 if (zero_mask != 0) {
2698 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2699 src_reg(0)));
2700 }
2701 /* Load the shadow comparitor */
2702 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2703 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2704 WRITEMASK_X),
2705 shadow_comparitor));
2706 inst->mlen++;
2707 }
2708
2709 /* Load the LOD info */
2710 if (ir->op == ir_tex || ir->op == ir_txl) {
2711 int mrf, writemask;
2712 if (devinfo->gen >= 5) {
2713 mrf = param_base + 1;
2714 if (ir->shadow_comparitor) {
2715 writemask = WRITEMASK_Y;
2716 /* mlen already incremented */
2717 } else {
2718 writemask = WRITEMASK_X;
2719 inst->mlen++;
2720 }
2721 } else /* devinfo->gen == 4 */ {
2722 mrf = param_base;
2723 writemask = WRITEMASK_W;
2724 }
2725 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2726 } else if (ir->op == ir_txf) {
2727 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2728 } else if (ir->op == ir_txf_ms) {
2729 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2730 sample_index));
2731 if (devinfo->gen >= 7) {
2732 /* MCS data is in the first channel of `mcs`, but we need to get it into
2733 * the .y channel of the second vec4 of params, so replicate .x across
2734 * the whole vec4 and then mask off everything except .y
2735 */
2736 mcs.swizzle = BRW_SWIZZLE_XXXX;
2737 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2738 mcs));
2739 }
2740 inst->mlen++;
2741 } else if (ir->op == ir_txd) {
2742 const glsl_type *type = lod_type;
2743
2744 if (devinfo->gen >= 5) {
2745 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2746 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2747 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2748 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2749 inst->mlen++;
2750
2751 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2752 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2753 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2754 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2755 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2756 inst->mlen++;
2757
2758 if (ir->shadow_comparitor) {
2759 emit(MOV(dst_reg(MRF, param_base + 2,
2760 ir->shadow_comparitor->type, WRITEMASK_Z),
2761 shadow_comparitor));
2762 }
2763 }
2764 } else /* devinfo->gen == 4 */ {
2765 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2766 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2767 inst->mlen += 2;
2768 }
2769 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2770 if (ir->shadow_comparitor) {
2771 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2772 shadow_comparitor));
2773 }
2774
2775 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2776 offset_value));
2777 inst->mlen++;
2778 }
2779 }
2780
2781 emit(inst);
2782
2783 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2784 * spec requires layers.
2785 */
2786 if (ir->op == ir_txs) {
2787 glsl_type const *type = ir->sampler->type;
2788 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2789 type->sampler_array) {
2790 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2791 writemask(inst->dst, WRITEMASK_Z),
2792 src_reg(inst->dst), src_reg(6));
2793 }
2794 }
2795
2796 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2797 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2798 }
2799
2800 swizzle_result(ir, src_reg(inst->dst), sampler);
2801 }
2802
2803 /**
2804 * Apply workarounds for Gen6 gather with UINT/SINT
2805 */
2806 void
2807 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2808 {
2809 if (!wa)
2810 return;
2811
2812 int width = (wa & WA_8BIT) ? 8 : 16;
2813 dst_reg dst_f = dst;
2814 dst_f.type = BRW_REGISTER_TYPE_F;
2815
2816 /* Convert from UNORM to UINT */
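/* The gathered UINT/SINT texel comes back as a UNORM float, so
 * multiplying by 2^width - 1 and converting back to an integer should
 * recover the original bits.
 */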
2817 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2818 emit(MOV(dst, src_reg(dst_f)));
2819
2820 if (wa & WA_SIGN) {
2821 /* Reinterpret the UINT value as a signed INT value by
2822 * shifting the sign bit into place, then shifting back
2823 * preserving sign.
2824 */
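/* e.g. for an 8-bit format: shift left by 24, then arithmetic shift
 * right by 24, sign-extending bit 7 into the upper bits.
 */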
2825 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2826 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2827 }
2828 }
2829
2830 /**
2831 * Set up the gather channel based on the swizzle, for gather4.
2832 */
2833 uint32_t
2834 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2835 {
2836 ir_constant *chan = ir->lod_info.component->as_constant();
2837 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2838 switch (swiz) {
2839 case SWIZZLE_X: return 0;
2840 case SWIZZLE_Y:
2841 /* gather4 sampler is broken for green channel on RG32F --
2842 * we must ask for blue instead.
2843 */
2844 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2845 return 2;
2846 return 1;
2847 case SWIZZLE_Z: return 2;
2848 case SWIZZLE_W: return 3;
2849 default:
2850 unreachable("Not reached"); /* zero, one swizzles handled already */
2851 }
2852 }
2853
2854 void
2855 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2856 {
2857 int s = key->tex.swizzles[sampler];
2858
2859 this->result = src_reg(this, ir->type);
2860 dst_reg swizzled_result(this->result);
2861
2862 if (ir->op == ir_query_levels) {
2863 /* # levels is in .w */
2864 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2865 emit(MOV(swizzled_result, orig_val));
2866 return;
2867 }
2868
2869 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2870 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2871 emit(MOV(swizzled_result, orig_val));
2872 return;
2873 }
2874
2875
2876 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2877 int swizzle[4] = {0};
2878
2879 for (int i = 0; i < 4; i++) {
2880 switch (GET_SWZ(s, i)) {
2881 case SWIZZLE_ZERO:
2882 zero_mask |= (1 << i);
2883 break;
2884 case SWIZZLE_ONE:
2885 one_mask |= (1 << i);
2886 break;
2887 default:
2888 copy_mask |= (1 << i);
2889 swizzle[i] = GET_SWZ(s, i);
2890 break;
2891 }
2892 }
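/* e.g. a swizzle of (R, G, B, ONE) gives copy_mask XYZ with an identity
 * swizzle and one_mask W, so one MOV copies .xyz and a second writes 1.0f
 * to .w.
 */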
2893
2894 if (copy_mask) {
2895 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2896 swizzled_result.writemask = copy_mask;
2897 emit(MOV(swizzled_result, orig_val));
2898 }
2899
2900 if (zero_mask) {
2901 swizzled_result.writemask = zero_mask;
2902 emit(MOV(swizzled_result, src_reg(0.0f)));
2903 }
2904
2905 if (one_mask) {
2906 swizzled_result.writemask = one_mask;
2907 emit(MOV(swizzled_result, src_reg(1.0f)));
2908 }
2909 }
2910
2911 void
2912 vec4_visitor::visit(ir_return *)
2913 {
2914 unreachable("not reached");
2915 }
2916
2917 void
2918 vec4_visitor::visit(ir_discard *)
2919 {
2920 unreachable("not reached");
2921 }
2922
2923 void
2924 vec4_visitor::visit(ir_if *ir)
2925 {
2926 /* Don't point the annotation at the if statement, because then it plus
2927 * the then and else blocks get printed.
2928 */
2929 this->base_ir = ir->condition;
2930
2931 if (devinfo->gen == 6) {
2932 emit_if_gen6(ir);
2933 } else {
2934 enum brw_predicate predicate;
2935 emit_bool_to_cond_code(ir->condition, &predicate);
2936 emit(IF(predicate));
2937 }
2938
2939 visit_instructions(&ir->then_instructions);
2940
2941 if (!ir->else_instructions.is_empty()) {
2942 this->base_ir = ir->condition;
2943 emit(BRW_OPCODE_ELSE);
2944
2945 visit_instructions(&ir->else_instructions);
2946 }
2947
2948 this->base_ir = ir->condition;
2949 emit(BRW_OPCODE_ENDIF);
2950 }
2951
2952 void
2953 vec4_visitor::visit(ir_emit_vertex *)
2954 {
2955 unreachable("not reached");
2956 }
2957
2958 void
2959 vec4_visitor::visit(ir_end_primitive *)
2960 {
2961 unreachable("not reached");
2962 }
2963
2964 void
2965 vec4_visitor::visit(ir_barrier *)
2966 {
2967 unreachable("not reached");
2968 }
2969
2970 void
2971 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2972 dst_reg dst, src_reg offset,
2973 src_reg src0, src_reg src1)
2974 {
2975 unsigned mlen = 0;
2976
2977 /* Set the atomic operation offset. */
2978 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2979 mlen++;
2980
2981 /* Set the atomic operation arguments. */
2982 if (src0.file != BAD_FILE) {
2983 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2984 mlen++;
2985 }
2986
2987 if (src1.file != BAD_FILE) {
2988 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2989 mlen++;
2990 }
2991
2992 /* Emit the instruction. Note that this maps to the normal SIMD8
2993 * untyped atomic message on Ivy Bridge, but that's OK because
2994 * unused channels will be masked out.
2995 */
2996 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2997 brw_message_reg(0),
2998 src_reg(surf_index), src_reg(atomic_op));
2999 inst->mlen = mlen;
3000 }
3001
3002 void
3003 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3004 src_reg offset)
3005 {
3006 /* Set the surface read offset. */
3007 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3008
3009 /* Emit the instruction. Note that this maps to the normal SIMD8
3010 * untyped surface read message, but that's OK because unused
3011 * channels will be masked out.
3012 */
3013 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3014 brw_message_reg(0),
3015 src_reg(surf_index), src_reg(1));
3016 inst->mlen = 1;
3017 }
3018
3019 void
3020 vec4_visitor::emit_ndc_computation()
3021 {
3022 /* Get the position */
3023 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3024
3025 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3026 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3027 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3028
3029 current_annotation = "NDC";
3030 dst_reg ndc_w = ndc;
3031 ndc_w.writemask = WRITEMASK_W;
3032 src_reg pos_w = pos;
3033 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3034 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3035
3036 dst_reg ndc_xyz = ndc;
3037 ndc_xyz.writemask = WRITEMASK_XYZ;
3038
3039 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3040 }
3041
3042 void
3043 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3044 {
3045 if (devinfo->gen < 6 &&
3046 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3047 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3048 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3049 dst_reg header1_w = header1;
3050 header1_w.writemask = WRITEMASK_W;
3051
3052 emit(MOV(header1, 0u));
3053
3054 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3055 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3056
3057 current_annotation = "Point size";
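/* Scaling by 2^11 and keeping bits 18:8 appears to pack the point width
 * as an 11-bit U8.3 fixed-point value in the header dword.
 */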
3058 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3059 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3060 }
3061
3062 if (key->userclip_active) {
3063 current_annotation = "Clipping flags";
3064 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3065 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3066
3067 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3068 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3069 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3070
3071 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3072 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3073 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3074 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3075 }
3076
3077 /* i965 clipping workaround:
3078 * 1) Test for negative rhw
3079 * 2) If set,
3080 * set ndc = (0,0,0,0)
3081 * set ucp[6] = 1
3082 *
3083 * Later, clipping will detect ucp[6] and ensure the primitive is
3084 * clipped against all fixed planes.
3085 */
3086 if (devinfo->has_negative_rhw_bug) {
3087 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3088 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3089 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3090 vec4_instruction *inst;
3091 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3092 inst->predicate = BRW_PREDICATE_NORMAL;
3093 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3094 inst->predicate = BRW_PREDICATE_NORMAL;
3095 }
3096
3097 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3098 } else if (devinfo->gen < 6) {
3099 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3100 } else {
3101 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3102 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3103 dst_reg reg_w = reg;
3104 reg_w.writemask = WRITEMASK_W;
3105 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3106 }
3107 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3108 dst_reg reg_y = reg;
3109 reg_y.writemask = WRITEMASK_Y;
3110 reg_y.type = BRW_REGISTER_TYPE_D;
3111 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3112 }
3113 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3114 dst_reg reg_z = reg;
3115 reg_z.writemask = WRITEMASK_Z;
3116 reg_z.type = BRW_REGISTER_TYPE_D;
3117 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3118 }
3119 }
3120 }
3121
3122 void
3123 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3124 {
3125 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3126 *
3127 * "If a linked set of shaders forming the vertex stage contains no
3128 * static write to gl_ClipVertex or gl_ClipDistance, but the
3129 * application has requested clipping against user clip planes through
3130 * the API, then the coordinate written to gl_Position is used for
3131 * comparison against the user clip planes."
3132 *
3133 * This function is only called if the shader didn't write to
3134 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3135 * if the user wrote to it; otherwise we use gl_Position.
3136 */
3137 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3138 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3139 clip_vertex = VARYING_SLOT_POS;
3140 }
3141
3142 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3143 ++i) {
3144 reg.writemask = 1 << i;
3145 emit(DP4(reg,
3146 src_reg(output_reg[clip_vertex]),
3147 src_reg(this->userplane[i + offset])));
3148 }
3149 }
3150
3151 vec4_instruction *
3152 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3153 {
3154 assert (varying < VARYING_SLOT_MAX);
3155 reg.type = output_reg[varying].type;
3156 current_annotation = output_reg_annotation[varying];
3157 /* Copy the register, saturating if necessary */
3158 return emit(MOV(reg, src_reg(output_reg[varying])));
3159 }
3160
3161 void
3162 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3163 {
3164 reg.type = BRW_REGISTER_TYPE_F;
3165
3166 switch (varying) {
3167 case VARYING_SLOT_PSIZ:
3168 {
3169 /* PSIZ is always in slot 0, and is coupled with other flags. */
3170 current_annotation = "indices, point width, clip flags";
3171 emit_psiz_and_flags(reg);
3172 break;
3173 }
3174 case BRW_VARYING_SLOT_NDC:
3175 current_annotation = "NDC";
3176 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3177 break;
3178 case VARYING_SLOT_POS:
3179 current_annotation = "gl_Position";
3180 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3181 break;
3182 case VARYING_SLOT_EDGE:
3183 /* This is present when doing unfilled polygons. We're supposed to copy
3184 * the edge flag from the user-provided vertex array
3185 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3186 * of that attribute (starts as 1.0f). This is then used in clipping to
3187 * determine which edges should be drawn as wireframe.
3188 */
3189 current_annotation = "edge flag";
3190 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3191 glsl_type::float_type, WRITEMASK_XYZW))));
3192 break;
3193 case BRW_VARYING_SLOT_PAD:
3194 /* No need to write to this slot */
3195 break;
3196 case VARYING_SLOT_COL0:
3197 case VARYING_SLOT_COL1:
3198 case VARYING_SLOT_BFC0:
3199 case VARYING_SLOT_BFC1: {
3200 /* These built-in varyings are only supported in compatibility mode,
3201 * and we only support GS in core profile. So, this must be a vertex
3202 * shader.
3203 */
3204 assert(stage == MESA_SHADER_VERTEX);
3205 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3206 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3207 inst->saturate = true;
3208 break;
3209 }
3210
3211 default:
3212 emit_generic_urb_slot(reg, varying);
3213 break;
3214 }
3215 }
3216
3217 static int
3218 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3219 {
3220 if (devinfo->gen >= 6) {
3221 /* URB data written (does not include the message header reg) must
3222 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3223 * section 5.4.3.2.2: URB_INTERLEAVED.
3224 *
3225 * URB entries are allocated on a multiple of 1024 bits, so an
3226 * extra 128 bits written here to make the end align to 256 is
3227 * no problem.
3228 */
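/* mlen includes the single header register, so the data length is even
 * exactly when mlen is odd; bump even values up by one.
 */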
3229 if ((mlen % 2) != 1)
3230 mlen++;
3231 }
3232
3233 return mlen;
3234 }
3235
3236
3237 /**
3238 * Generates the VUE payload plus the necessary URB write instructions to
3239 * output it.
3240 *
3241 * The VUE layout is documented in Volume 2a.
3242 */
3243 void
3244 vec4_visitor::emit_vertex()
3245 {
3246 /* MRF 0 is reserved for the debugger, so start with message header
3247 * in MRF 1.
3248 */
3249 int base_mrf = 1;
3250 int mrf = base_mrf;
3251 /* In the process of generating our URB write message contents, we
3252 * may need to unspill a register or load from an array. Those
3253 * reads would use MRFs 14-15.
3254 */
3255 int max_usable_mrf = 13;
3256
3257 /* The following assertion verifies that max_usable_mrf causes an
3258 * even-numbered amount of URB write data, which will meet gen6's
3259 * requirements for length alignment.
3260 */
3261 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3262
3263 /* First mrf is the g0-based message header containing URB handles and
3264 * such.
3265 */
3266 emit_urb_write_header(mrf++);
3267
3268 if (devinfo->gen < 6) {
3269 emit_ndc_computation();
3270 }
3271
3272 /* Lower legacy ff and ClipVertex clipping to clip distances */
3273 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3274 current_annotation = "user clip distances";
3275
3276 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3277 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3278
3279 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3280 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3281 }
3282
3283 /* We may need to split this up into several URB writes, so do them in a
3284 * loop.
3285 */
3286 int slot = 0;
3287 bool complete = false;
3288 do {
3289 /* URB offset is in URB row increments, and each of our MRFs is half of
3290 * one of those, since we're doing interleaved writes.
3291 */
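/* With base_mrf 1 and max_usable_mrf 13, each write carries at most 12
 * slot MRFs (6 URB rows), so a VUE with more slots continues in a second
 * write starting at this offset.
 */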
3292 int offset = slot / 2;
3293
3294 mrf = base_mrf + 1;
3295 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3296 emit_urb_slot(dst_reg(MRF, mrf++),
3297 prog_data->vue_map.slot_to_varying[slot]);
3298
3299 /* If this was max_usable_mrf, we can't fit anything more into this
3300 * URB WRITE.
3301 */
3302 if (mrf > max_usable_mrf) {
3303 slot++;
3304 break;
3305 }
3306 }
3307
3308 complete = slot >= prog_data->vue_map.num_slots;
3309 current_annotation = "URB write";
3310 vec4_instruction *inst = emit_urb_write_opcode(complete);
3311 inst->base_mrf = base_mrf;
3312 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3313 inst->offset += offset;
3314 } while(!complete);
3315 }
3316
3317
3318 src_reg
3319 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3320 src_reg *reladdr, int reg_offset)
3321 {
3322 /* Because we store the values to scratch interleaved like our
3323 * vertex data, we need to scale the vec4 index by 2.
3324 */
3325 int message_header_scale = 2;
3326
3327 /* Pre-gen6, the message header uses byte offsets instead of vec4
3328 * (16-byte) offset units.
3329 */
3330 if (devinfo->gen < 6)
3331 message_header_scale *= 16;
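/* e.g. reg_offset 3 becomes scratch row 6 on gen6+, or byte offset 96 on
 * older generations.
 */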
3332
3333 if (reladdr) {
3334 src_reg index = src_reg(this, glsl_type::int_type);
3335
3336 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3337 src_reg(reg_offset)));
3338 emit_before(block, inst, MUL(dst_reg(index), index,
3339 src_reg(message_header_scale)));
3340
3341 return index;
3342 } else {
3343 return src_reg(reg_offset * message_header_scale);
3344 }
3345 }
3346
3347 src_reg
3348 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3349 src_reg *reladdr, int reg_offset)
3350 {
3351 if (reladdr) {
3352 src_reg index = src_reg(this, glsl_type::int_type);
3353
3354 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3355 src_reg(reg_offset)));
3356
3357 /* Pre-gen6, the message header uses byte offsets instead of vec4
3358 * (16-byte) offset units.
3359 */
3360 if (devinfo->gen < 6) {
3361 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3362 }
3363
3364 return index;
3365 } else if (devinfo->gen >= 8) {
3366 /* Store the offset in a GRF so we can send-from-GRF. */
3367 src_reg offset = src_reg(this, glsl_type::int_type);
3368 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3369 return offset;
3370 } else {
3371 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3372 return src_reg(reg_offset * message_header_scale);
3373 }
3374 }
3375
3376 /**
3377 * Emits an instruction before @inst to load the value named by @orig_src
3378 * from scratch space at @base_offset to @temp.
3379 *
3380 * @base_offset is measured in 32-byte units (the size of a register).
3381 */
3382 void
3383 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3384 dst_reg temp, src_reg orig_src,
3385 int base_offset)
3386 {
3387 int reg_offset = base_offset + orig_src.reg_offset;
3388 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3389 reg_offset);
3390
3391 emit_before(block, inst, SCRATCH_READ(temp, index));
3392 }
3393
3394 /**
3395 * Emits an instruction after @inst to store the value to be written
3396 * to @orig_dst to scratch space at @base_offset, from @temp.
3397 *
3398 * @base_offset is measured in 32-byte units (the size of a register).
3399 */
3400 void
3401 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3402 int base_offset)
3403 {
3404 int reg_offset = base_offset + inst->dst.reg_offset;
3405 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3406 reg_offset);
3407
3408 /* Create a temporary register to store *inst's result in.
3409 *
3410 * We have to be careful in MOVing from our temporary result register in
3411 * the scratch write. If we swizzle from channels of the temporary that
3412 * weren't initialized, it will confuse live interval analysis, which will
3413 * make spilling fail to make progress.
3414 */
3415 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3416 inst->dst.type),
3417 brw_swizzle_for_mask(inst->dst.writemask));
3418 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3419 inst->dst.writemask));
3420 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3421 write->predicate = inst->predicate;
3422 write->ir = inst->ir;
3423 write->annotation = inst->annotation;
3424 inst->insert_after(block, write);
3425
3426 inst->dst.file = temp.file;
3427 inst->dst.reg = temp.reg;
3428 inst->dst.reg_offset = temp.reg_offset;
3429 inst->dst.reladdr = NULL;
3430 }
3431
3432 /**
3433 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3434 * adds the scratch read(s) before \p inst. The function also checks for
3435 * recursive reladdr scratch accesses, issuing the corresponding scratch
3436 * loads and rewriting reladdr references accordingly.
3437 *
3438 * \return \p src if it did not require a scratch load, otherwise, the
3439 * register holding the result of the scratch load that the caller should
3440 * use to rewrite src.
3441 */
3442 src_reg
3443 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3444 vec4_instruction *inst, src_reg src)
3445 {
3446 /* Resolve recursive reladdr scratch access by calling ourselves
3447 * with src.reladdr
3448 */
3449 if (src.reladdr)
3450 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3451 *src.reladdr);
3452
3453 /* Now handle scratch access on src */
3454 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3455 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3456 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3457 src.reg = temp.reg;
3458 src.reg_offset = temp.reg_offset;
3459 src.reladdr = NULL;
3460 }
3461
3462 return src;
3463 }
3464
3465 /**
3466 * We can't generally support array access in GRF space, because a
3467 * single instruction's destination can only span 2 contiguous
3468 * registers. So, we send all GRF arrays that get variable index
3469 * access to scratch space.
3470 */
3471 void
3472 vec4_visitor::move_grf_array_access_to_scratch()
3473 {
3474 int scratch_loc[this->alloc.count];
3475 memset(scratch_loc, -1, sizeof(scratch_loc));
3476
3477 /* First, calculate the set of virtual GRFs that need to be punted
3478 * to scratch due to having any array access on them, and where in
3479 * scratch.
3480 */
3481 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3482 if (inst->dst.file == GRF && inst->dst.reladdr) {
3483 if (scratch_loc[inst->dst.reg] == -1) {
3484 scratch_loc[inst->dst.reg] = last_scratch;
3485 last_scratch += this->alloc.sizes[inst->dst.reg];
3486 }
3487
3488 for (src_reg *iter = inst->dst.reladdr;
3489 iter->reladdr;
3490 iter = iter->reladdr) {
3491 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3492 scratch_loc[iter->reg] = last_scratch;
3493 last_scratch += this->alloc.sizes[iter->reg];
3494 }
3495 }
3496 }
3497
3498 for (int i = 0 ; i < 3; i++) {
3499 for (src_reg *iter = &inst->src[i];
3500 iter->reladdr;
3501 iter = iter->reladdr) {
3502 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3503 scratch_loc[iter->reg] = last_scratch;
3504 last_scratch += this->alloc.sizes[iter->reg];
3505 }
3506 }
3507 }
3508 }
3509
3510 /* Now, for anything that will be accessed through scratch, rewrite
3511 * it to load/store. Note that this is a _safe list walk, because
3512 * we may generate a new scratch_write instruction after the one
3513 * we're processing.
3514 */
3515 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3516 /* Set up the annotation tracking for new generated instructions. */
3517 base_ir = inst->ir;
3518 current_annotation = inst->annotation;
3519
3520 /* First handle scratch access on the dst. Notice we have to handle
3521 * the case where the dst's reladdr also points to scratch space.
3522 */
3523 if (inst->dst.reladdr)
3524 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3525 *inst->dst.reladdr);
3526
3527 /* Now that we have handled any (possibly recursive) reladdr scratch
3528 * accesses for dst we can safely do the scratch write for dst itself
3529 */
3530 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3531 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3532
3533 /* Now handle scratch access on any src. In this case, since inst->src[i]
3534 * already is a src_reg, we can just call emit_resolve_reladdr with
3535 * inst->src[i] and it will take care of handling scratch loads for
3536 * both src and src.reladdr (recursively).
3537 */
3538 for (int i = 0 ; i < 3; i++) {
3539 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3540 inst->src[i]);
3541 }
3542 }
3543 }
3544
3545 /**
3546 * Emits an instruction before @inst to load the value named by @orig_src
3547 * from the pull constant buffer (surface) at @base_offset to @temp.
3548 */
3549 void
3550 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3551 dst_reg temp, src_reg orig_src,
3552 int base_offset)
3553 {
3554 int reg_offset = base_offset + orig_src.reg_offset;
3555 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3556 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3557 reg_offset);
3558
3559 emit_pull_constant_load_reg(temp,
3560 index,
3561 offset,
3562 block, inst);
3563 }
3564
3565 /**
3566 * Implements array access of uniforms by inserting a
3567 * PULL_CONSTANT_LOAD instruction.
3568 *
3569 * Unlike temporary GRF array access (where we don't support it due to
3570 * the difficulty of doing relative addressing on instruction
3571 * destinations), we could potentially do array access of uniforms
3572 * that were loaded in GRF space as push constants. In real-world
3573 * usage we've seen, though, the arrays being used are always larger
3574 * than we could load as push constants, so just always move all
3575 * uniform array access out to a pull constant buffer.
3576 */
3577 void
3578 vec4_visitor::move_uniform_array_access_to_pull_constants()
3579 {
3580 int pull_constant_loc[this->uniforms];
3581 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3582 bool nested_reladdr;
3583
3584 /* Walk through and find array access of uniforms. Put a copy of that
3585 * uniform in the pull constant buffer.
3586 *
3587 * Note that we don't move constant-indexed accesses to arrays. No
3588 * testing has been done of the performance impact of this choice.
3589 */
3590 do {
3591 nested_reladdr = false;
3592
3593 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3594 for (int i = 0 ; i < 3; i++) {
3595 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3596 continue;
3597
3598 int uniform = inst->src[i].reg;
3599
3600 if (inst->src[i].reladdr->reladdr)
3601 nested_reladdr = true; /* will need another pass */
3602
3603 /* If this array isn't already present in the pull constant buffer,
3604 * add it.
3605 */
3606 if (pull_constant_loc[uniform] == -1) {
3607 const gl_constant_value **values =
3608 &stage_prog_data->param[uniform * 4];
3609
3610 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3611
3612 assert(uniform < uniform_array_size);
3613 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3614 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3615 = values[j];
3616 }
3617 }
3618
3619 /* Set up the annotation tracking for new generated instructions. */
3620 base_ir = inst->ir;
3621 current_annotation = inst->annotation;
3622
3623 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3624
3625 emit_pull_constant_load(block, inst, temp, inst->src[i],
3626 pull_constant_loc[uniform]);
3627
3628 inst->src[i].file = temp.file;
3629 inst->src[i].reg = temp.reg;
3630 inst->src[i].reg_offset = temp.reg_offset;
3631 inst->src[i].reladdr = NULL;
3632 }
3633 }
3634 } while (nested_reladdr);
3635
3636 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3637 * no need to track them as larger-than-vec4 objects. This will be
3638 * relied on in cutting out unused uniform vectors from push
3639 * constants.
3640 */
3641 split_uniform_registers();
3642 }
3643
3644 void
3645 vec4_visitor::resolve_ud_negate(src_reg *reg)
3646 {
3647 if (reg->type != BRW_REGISTER_TYPE_UD ||
3648 !reg->negate)
3649 return;
3650
3651 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3652 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3653 *reg = temp;
3654 }
3655
3656 /**
3657 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3658 *
3659 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3660 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3661 */
3662 void
3663 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3664 {
3665 assert(devinfo->gen <= 5);
3666
3667 if (!rvalue->type->is_boolean())
3668 return;
3669
3670 src_reg and_result = src_reg(this, rvalue->type);
3671 src_reg neg_result = src_reg(this, rvalue->type);
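/* ANDing with 1 keeps only the defined LSB; negating the resulting 0/1
 * gives 0 or -1 (~0), the canonical boolean encoding used downstream.
 */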
3672 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3673 emit(MOV(dst_reg(neg_result), negate(and_result)));
3674 *reg = neg_result;
3675 }
3676
3677 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3678 void *log_data,
3679 struct gl_program *prog,
3680 const struct brw_vue_prog_key *key,
3681 struct brw_vue_prog_data *prog_data,
3682 struct gl_shader_program *shader_prog,
3683 gl_shader_stage stage,
3684 void *mem_ctx,
3685 bool no_spills,
3686 int shader_time_index)
3687 : backend_shader(compiler, log_data, mem_ctx,
3688 shader_prog, prog, &prog_data->base, stage),
3689 key(key),
3690 prog_data(prog_data),
3691 sanity_param_count(0),
3692 fail_msg(NULL),
3693 first_non_payload_grf(0),
3694 need_all_constants_in_pull_buffer(false),
3695 no_spills(no_spills),
3696 shader_time_index(shader_time_index),
3697 last_scratch(0)
3698 {
3699 this->failed = false;
3700
3701 this->base_ir = NULL;
3702 this->current_annotation = NULL;
3703 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3704
3705 this->variable_ht = hash_table_ctor(0,
3706 hash_table_pointer_hash,
3707 hash_table_pointer_compare);
3708
3709 this->virtual_grf_start = NULL;
3710 this->virtual_grf_end = NULL;
3711 this->live_intervals = NULL;
3712
3713 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3714
3715 this->uniforms = 0;
3716
3717 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3718 * at least one. See setup_uniforms() in brw_vec4.cpp.
3719 */
3720 this->uniform_array_size = 1;
3721 if (prog_data) {
3722 this->uniform_array_size =
3723 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3724 }
3725
3726 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3727 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3728 }
3729
3730 vec4_visitor::~vec4_visitor()
3731 {
3732 hash_table_dtor(this->variable_ht);
3733 }
3734
3735
3736 void
3737 vec4_visitor::fail(const char *format, ...)
3738 {
3739 va_list va;
3740 char *msg;
3741
3742 if (failed)
3743 return;
3744
3745 failed = true;
3746
3747 va_start(va, format);
3748 msg = ralloc_vasprintf(mem_ctx, format, va);
3749 va_end(va);
3750 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3751
3752 this->fail_msg = msg;
3753
3754 if (debug_enabled) {
3755 fprintf(stderr, "%s", msg);
3756 }
3757 }
3758
3759 } /* namespace brw */