i965/vec4: Change vec4_visitor::gather_channel() method to allow reuse
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
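   /* Note that the anchor instruction, not new_inst, is returned to the
    * caller.
    */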
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
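 *
 * A typical use is a CMP that writes only the flag register, followed by an
 * instruction predicated with BRW_PREDICATE_NORMAL, as emit_minmax() does
 * below for gen < 6.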
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
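   /* elements must be 2, 3 or 4, selecting DP2, DP3 or DP4 respectively. */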
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 vec4_instruction *
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 math = emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358
359 return math;
360 }
361
362 void
363 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
364 {
365 if (devinfo->gen < 7) {
366 unreachable("ir_unop_pack_half_2x16 should be lowered");
367 }
368
369 assert(dst.type == BRW_REGISTER_TYPE_UD);
370 assert(src0.type == BRW_REGISTER_TYPE_F);
371
372 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
373 *
374 * Because this instruction does not have a 16-bit floating-point type,
375 * the destination data type must be Word (W).
376 *
377 * The destination must be DWord-aligned and specify a horizontal stride
378 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
379 * each destination channel and the upper word is not modified.
380 *
381 * The above restriction implies that the f32to16 instruction must use
382 * align1 mode, because only in align1 mode is it possible to specify
383 * horizontal stride. We choose here to defy the hardware docs and emit
384 * align16 instructions.
385 *
386 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
387 * instructions. I was partially successful in that the code passed all
388 * tests. However, the code was dubiously correct and fragile, and the
389 * tests were not harsh enough to probe that frailty. Not trusting the
390 * code, I chose instead to remain in align16 mode in defiance of the hw
391 * docs).
392 *
393 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
394 * simulator, emitting a f32to16 in align16 mode with UD as destination
395 * data type is safe. The behavior differs from that specified in the PRM
396 * in that the upper word of each destination channel is cleared to 0.
397 */
398
399 dst_reg tmp_dst(this, glsl_type::uvec2_type);
400 src_reg tmp_src(tmp_dst);
401
402 #if 0
403 /* Verify the undocumented behavior on which the following instructions
404 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
405 * then the result of the bit-or instruction below will be incorrect.
406 *
407 * You should inspect the disasm output in order to verify that the MOV is
408 * not optimized away.
409 */
410 emit(MOV(tmp_dst, src_reg(0x12345678u)));
411 #endif
412
413 /* Give tmp the form below, where "." means untouched.
414 *
415 * w z y x w z y x
416 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
417 *
418 * That the upper word of each write-channel be 0 is required for the
419 * following bit-shift and bit-or instructions to work. Note that this
420 * relies on the undocumented hardware behavior mentioned above.
421 */
422 tmp_dst.writemask = WRITEMASK_XY;
423 emit(F32TO16(tmp_dst, src0));
424
425 /* Give the write-channels of dst the form:
426 * 0xhhhh0000
427 */
428 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
429 emit(SHL(dst, tmp_src, src_reg(16u)));
430
431 /* Finally, give the write-channels of dst the form of packHalf2x16's
432 * output:
433 * 0xhhhhllll
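 *
 * (For example, an input of vec2(1.0, 2.0) leaves 0x40003C00 in each
 *  enabled channel, since 1.0 and 2.0 are 0x3C00 and 0x4000 as half-floats.)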
434 */
435 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
436 emit(OR(dst, src_reg(dst), tmp_src));
437 }
438
439 void
440 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
441 {
442 if (devinfo->gen < 7) {
443 unreachable("ir_unop_unpack_half_2x16 should be lowered");
444 }
445
446 assert(dst.type == BRW_REGISTER_TYPE_F);
447 assert(src0.type == BRW_REGISTER_TYPE_UD);
448
449 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
450 *
451 * Because this instruction does not have a 16-bit floating-point type,
452 * the source data type must be Word (W). The destination type must be
453 * F (Float).
454 *
455 * To use W as the source data type, we must adjust horizontal strides,
456 * which is only possible in align1 mode. All my [chadv] attempts at
457 * emitting align1 instructions for unpackHalf2x16 failed to pass the
458 * Piglit tests, so I gave up.
459 *
460 * I've verified that, on gen7 hardware and the simulator, it is safe to
461 * emit f16to32 in align16 mode with UD as source data type.
462 */
463
464 dst_reg tmp_dst(this, glsl_type::uvec2_type);
465 src_reg tmp_src(tmp_dst);
466
467 tmp_dst.writemask = WRITEMASK_X;
468 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
469
470 tmp_dst.writemask = WRITEMASK_Y;
471 emit(SHR(tmp_dst, src0, src_reg(16u)));
472
473 dst.writemask = WRITEMASK_XY;
474 emit(F16TO32(dst, tmp_src));
475 }
476
477 void
478 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
479 {
480 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
481 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
482 * is not suitable to generate the shift values, but we can use the packed
483 * vector float and a type-converting MOV.
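 *
 * (In the hardware's 8-bit vector-float (VF) immediate encoding, 0x00,
 *  0x60, 0x70 and 0x78 stand for 0.0, 8.0, 16.0 and 24.0, so the
 *  type-converting MOV below yields exactly the <0, 8, 16, 24> shift
 *  vector.)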
484 */
485 dst_reg shift(this, glsl_type::uvec4_type);
486 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
487
488 dst_reg shifted(this, glsl_type::uvec4_type);
489 src0.swizzle = BRW_SWIZZLE_XXXX;
490 emit(SHR(shifted, src0, src_reg(shift)));
491
492 shifted.type = BRW_REGISTER_TYPE_UB;
493 dst_reg f(this, glsl_type::vec4_type);
494 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
495
496 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
497 }
498
499 void
500 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
501 {
502 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
503 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
504 * is not suitable to generate the shift values, but we can use the packed
505 * vector float and a type-converting MOV.
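 *
 * The 0x00/0x60/0x70/0x78 immediate is the VF encoding of <0, 8, 16, 24>,
 * as noted in emit_unpack_unorm_4x8() above.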
506 */
507 dst_reg shift(this, glsl_type::uvec4_type);
508 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
509
510 dst_reg shifted(this, glsl_type::uvec4_type);
511 src0.swizzle = BRW_SWIZZLE_XXXX;
512 emit(SHR(shifted, src0, src_reg(shift)));
513
514 shifted.type = BRW_REGISTER_TYPE_B;
515 dst_reg f(this, glsl_type::vec4_type);
516 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
517
518 dst_reg scaled(this, glsl_type::vec4_type);
519 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
520
521 dst_reg max(this, glsl_type::vec4_type);
522 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
523 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
524 }
525
526 void
527 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
528 {
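   /* packUnorm4x8(): clamp each component to [0, 1], scale by 255, round to
    * the nearest (even) integer, convert to unsigned, and pack the four
    * bytes into a single dword.
    */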
529 dst_reg saturated(this, glsl_type::vec4_type);
530 vec4_instruction *inst = emit(MOV(saturated, src0));
531 inst->saturate = true;
532
533 dst_reg scaled(this, glsl_type::vec4_type);
534 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
535
536 dst_reg rounded(this, glsl_type::vec4_type);
537 emit(RNDE(rounded, src_reg(scaled)));
538
539 dst_reg u(this, glsl_type::uvec4_type);
540 emit(MOV(u, src_reg(rounded)));
541
542 src_reg bytes(u);
543 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
544 }
545
546 void
547 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
548 {
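   /* packSnorm4x8(): clamp each component to [-1, 1], scale by 127, round
    * to the nearest (even) integer, convert to signed, and pack the four
    * bytes into a dword.
    */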
549 dst_reg max(this, glsl_type::vec4_type);
550 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
551
552 dst_reg min(this, glsl_type::vec4_type);
553 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
554
555 dst_reg scaled(this, glsl_type::vec4_type);
556 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
557
558 dst_reg rounded(this, glsl_type::vec4_type);
559 emit(RNDE(rounded, src_reg(scaled)));
560
561 dst_reg i(this, glsl_type::ivec4_type);
562 emit(MOV(i, src_reg(rounded)));
563
564 src_reg bytes(i);
565 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
566 }
567
568 void
569 vec4_visitor::visit_instructions(const exec_list *list)
570 {
571 foreach_in_list(ir_instruction, ir, list) {
572 base_ir = ir;
573 ir->accept(this);
574 }
575 }
576
577 /**
578 * Returns the minimum number of vec4 elements needed to pack a type.
579 *
580 * For simple types, it will return 1 (a single vec4); for matrices, the
581 * number of columns; for array and struct, the sum of the vec4_size of
582 * each of its elements; and for sampler and atomic, zero.
583 *
584 * This method is useful to calculate how much register space is needed to
585 * store a particular type.
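 *
 * For example: a float or a vec3 takes 1, a mat4 takes 4, and vec2[3]
 * takes 3.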
586 */
587 int
588 vec4_visitor::type_size(const struct glsl_type *type)
589 {
590 unsigned int i;
591 int size;
592
593 switch (type->base_type) {
594 case GLSL_TYPE_UINT:
595 case GLSL_TYPE_INT:
596 case GLSL_TYPE_FLOAT:
597 case GLSL_TYPE_BOOL:
598 if (type->is_matrix()) {
599 return type->matrix_columns;
600 } else {
601 /* Regardless of size of vector, it gets a vec4. This is bad
602 * packing for things like floats, but otherwise arrays become a
603 * mess. Hopefully a later pass over the code can pack scalars
604 * down if appropriate.
605 */
606 return 1;
607 }
608 case GLSL_TYPE_ARRAY:
609 assert(type->length > 0);
610 return type_size(type->fields.array) * type->length;
611 case GLSL_TYPE_STRUCT:
612 size = 0;
613 for (i = 0; i < type->length; i++) {
614 size += type_size(type->fields.structure[i].type);
615 }
616 return size;
617 case GLSL_TYPE_SUBROUTINE:
618 return 1;
619
620 case GLSL_TYPE_SAMPLER:
621 /* Samplers take up no register space, since they're baked in at
622 * link time.
623 */
624 return 0;
625 case GLSL_TYPE_ATOMIC_UINT:
626 return 0;
627 case GLSL_TYPE_IMAGE:
628 case GLSL_TYPE_VOID:
629 case GLSL_TYPE_DOUBLE:
630 case GLSL_TYPE_ERROR:
631 case GLSL_TYPE_INTERFACE:
632 unreachable("not reached");
633 }
634
635 return 0;
636 }
637
638 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
639 {
640 init();
641
642 this->file = GRF;
643 this->reg = v->alloc.allocate(v->type_size(type));
644
645 if (type->is_array() || type->is_record()) {
646 this->swizzle = BRW_SWIZZLE_NOOP;
647 } else {
648 this->swizzle = brw_swizzle_for_size(type->vector_elements);
649 }
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
655 {
656 assert(size > 0);
657
658 init();
659
660 this->file = GRF;
661 this->reg = v->alloc.allocate(v->type_size(type) * size);
662
663 this->swizzle = BRW_SWIZZLE_NOOP;
664
665 this->type = brw_type_for_base_type(type);
666 }
667
668 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
669 {
670 init();
671
672 this->file = GRF;
673 this->reg = v->alloc.allocate(v->type_size(type));
674
675 if (type->is_array() || type->is_record()) {
676 this->writemask = WRITEMASK_XYZW;
677 } else {
678 this->writemask = (1 << type->vector_elements) - 1;
679 }
680
681 this->type = brw_type_for_base_type(type);
682 }
683
684 void
685 vec4_visitor::setup_vector_uniform_values(const gl_constant_value *values,
686 unsigned n)
687 {
688 static const gl_constant_value zero = { 0 };
689
690 for (unsigned i = 0; i < n; ++i)
691 stage_prog_data->param[4 * uniforms + i] = &values[i];
692
693 for (unsigned i = n; i < 4; ++i)
694 stage_prog_data->param[4 * uniforms + i] = &zero;
695
696 uniform_vector_size[uniforms++] = n;
697 }
698
699 /* Our support for uniforms is piggy-backed on the struct
700 * gl_fragment_program, because that's where the values actually
701 * get stored, rather than in some global gl_shader_program uniform
702 * store.
703 */
704 void
705 vec4_visitor::setup_uniform_values(ir_variable *ir)
706 {
707 int namelen = strlen(ir->name);
708
709 /* The data for our (non-builtin) uniforms is stored in a series of
710 * gl_uniform_driver_storage structs for each subcomponent that
711 * glGetUniformLocation() could name. We know it's been set up in the same
712 * order we'd walk the type, so walk the list of storage and find anything
 * with our name, or any component whose name starts with ours followed by
 * '.' or '['.
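 *
 * For example, for an ir->name of "lights", storage entries named
 * "lights", "lights[2]" or "lights[0].position" match, while
 * "lights_enabled" does not.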
714 */
715 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
716 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
717
718 if (storage->builtin)
719 continue;
720
721 if (strncmp(ir->name, storage->name, namelen) != 0 ||
722 (storage->name[namelen] != 0 &&
723 storage->name[namelen] != '.' &&
724 storage->name[namelen] != '[')) {
725 continue;
726 }
727
728 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
729 storage->type->matrix_columns);
730 const unsigned vector_size = storage->type->vector_elements;
731
732 for (unsigned s = 0; s < vector_count; s++)
733 setup_vector_uniform_values(&storage->storage[s * vector_size],
734 vector_size);
735 }
736 }
737
738 void
739 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
740 {
741 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
742 assert(this->uniforms < uniform_array_size);
743 this->uniform_vector_size[this->uniforms] = 4;
744 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
745 this->userplane[i].type = BRW_REGISTER_TYPE_F;
746 for (int j = 0; j < 4; ++j) {
747 stage_prog_data->param[this->uniforms * 4 + j] =
748 (gl_constant_value *) &clip_planes[i][j];
749 }
750 ++this->uniforms;
751 }
752 }
753
754 /* Our support for builtin uniforms is even scarier than non-builtin.
755 * It sits on top of the PROG_STATE_VAR parameters that are
756 * automatically updated from GL context state.
757 */
758 void
759 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
760 {
761 const ir_state_slot *const slots = ir->get_state_slots();
762 assert(slots != NULL);
763
764 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
765 /* This state reference has already been setup by ir_to_mesa,
766 * but we'll get the same index back here. We can reference
767 * ParameterValues directly, since unlike brw_fs.cpp, we never
768 * add new state references during compile.
769 */
770 int index = _mesa_add_state_reference(this->prog->Parameters,
771 (gl_state_index *)slots[i].tokens);
772 gl_constant_value *values =
773 &this->prog->Parameters->ParameterValues[index][0];
774
775 assert(this->uniforms < uniform_array_size);
776
777 for (unsigned j = 0; j < 4; j++)
778 stage_prog_data->param[this->uniforms * 4 + j] =
779 &values[GET_SWZ(slots[i].swizzle, j)];
780
781 this->uniform_vector_size[this->uniforms] =
782 (ir->type->is_scalar() || ir->type->is_vector() ||
783 ir->type->is_matrix() ? ir->type->vector_elements : 4);
784
785 this->uniforms++;
786 }
787 }
788
789 dst_reg *
790 vec4_visitor::variable_storage(ir_variable *var)
791 {
792 return (dst_reg *)hash_table_find(this->variable_ht, var);
793 }
794
795 void
796 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
797 enum brw_predicate *predicate)
798 {
799 ir_expression *expr = ir->as_expression();
800
801 *predicate = BRW_PREDICATE_NORMAL;
802
803 if (expr && expr->operation != ir_binop_ubo_load) {
804 src_reg op[3];
805 vec4_instruction *inst;
806
807 assert(expr->get_num_operands() <= 3);
808 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
809 expr->operands[i]->accept(this);
810 op[i] = this->result;
811
812 resolve_ud_negate(&op[i]);
813 }
814
815 switch (expr->operation) {
816 case ir_unop_logic_not:
817 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
818 inst->conditional_mod = BRW_CONDITIONAL_Z;
819 break;
820
821 case ir_binop_logic_xor:
822 if (devinfo->gen <= 5) {
823 src_reg temp = src_reg(this, ir->type);
824 emit(XOR(dst_reg(temp), op[0], op[1]));
825 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
826 } else {
827 inst = emit(XOR(dst_null_d(), op[0], op[1]));
828 }
829 inst->conditional_mod = BRW_CONDITIONAL_NZ;
830 break;
831
832 case ir_binop_logic_or:
833 if (devinfo->gen <= 5) {
834 src_reg temp = src_reg(this, ir->type);
835 emit(OR(dst_reg(temp), op[0], op[1]));
836 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
837 } else {
838 inst = emit(OR(dst_null_d(), op[0], op[1]));
839 }
840 inst->conditional_mod = BRW_CONDITIONAL_NZ;
841 break;
842
843 case ir_binop_logic_and:
844 if (devinfo->gen <= 5) {
845 src_reg temp = src_reg(this, ir->type);
846 emit(AND(dst_reg(temp), op[0], op[1]));
847 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
848 } else {
849 inst = emit(AND(dst_null_d(), op[0], op[1]));
850 }
851 inst->conditional_mod = BRW_CONDITIONAL_NZ;
852 break;
853
854 case ir_unop_f2b:
855 if (devinfo->gen >= 6) {
856 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
857 } else {
858 inst = emit(MOV(dst_null_f(), op[0]));
859 inst->conditional_mod = BRW_CONDITIONAL_NZ;
860 }
861 break;
862
863 case ir_unop_i2b:
864 if (devinfo->gen >= 6) {
865 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
866 } else {
867 inst = emit(MOV(dst_null_d(), op[0]));
868 inst->conditional_mod = BRW_CONDITIONAL_NZ;
869 }
870 break;
871
872 case ir_binop_all_equal:
873 if (devinfo->gen <= 5) {
874 resolve_bool_comparison(expr->operands[0], &op[0]);
875 resolve_bool_comparison(expr->operands[1], &op[1]);
876 }
877 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
878 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
879 break;
880
881 case ir_binop_any_nequal:
882 if (devinfo->gen <= 5) {
883 resolve_bool_comparison(expr->operands[0], &op[0]);
884 resolve_bool_comparison(expr->operands[1], &op[1]);
885 }
886 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
887 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
888 break;
889
890 case ir_unop_any:
891 if (devinfo->gen <= 5) {
892 resolve_bool_comparison(expr->operands[0], &op[0]);
893 }
894 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
895 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
896 break;
897
898 case ir_binop_greater:
899 case ir_binop_gequal:
900 case ir_binop_less:
901 case ir_binop_lequal:
902 case ir_binop_equal:
903 case ir_binop_nequal:
904 if (devinfo->gen <= 5) {
905 resolve_bool_comparison(expr->operands[0], &op[0]);
906 resolve_bool_comparison(expr->operands[1], &op[1]);
907 }
908 emit(CMP(dst_null_d(), op[0], op[1],
909 brw_conditional_for_comparison(expr->operation)));
910 break;
911
912 case ir_triop_csel: {
913 /* Expand the boolean condition into the flag register. */
914 inst = emit(MOV(dst_null_d(), op[0]));
915 inst->conditional_mod = BRW_CONDITIONAL_NZ;
916
917 /* Select which boolean to return. */
918 dst_reg temp(this, expr->operands[1]->type);
919 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
920 inst->predicate = BRW_PREDICATE_NORMAL;
921
922 /* Expand the result to a condition code. */
923 inst = emit(MOV(dst_null_d(), src_reg(temp)));
924 inst->conditional_mod = BRW_CONDITIONAL_NZ;
925 break;
926 }
927
928 default:
929 unreachable("not reached");
930 }
931 return;
932 }
933
934 ir->accept(this);
935
936 resolve_ud_negate(&this->result);
937
938 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
939 inst->conditional_mod = BRW_CONDITIONAL_NZ;
940 }
941
942 /**
943 * Emit a gen6 IF statement with the comparison folded into the IF
944 * instruction.
945 */
946 void
947 vec4_visitor::emit_if_gen6(ir_if *ir)
948 {
949 ir_expression *expr = ir->condition->as_expression();
950
951 if (expr && expr->operation != ir_binop_ubo_load) {
952 src_reg op[3];
953 dst_reg temp;
954
955 assert(expr->get_num_operands() <= 3);
956 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
957 expr->operands[i]->accept(this);
958 op[i] = this->result;
959 }
960
961 switch (expr->operation) {
962 case ir_unop_logic_not:
963 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
964 return;
965
966 case ir_binop_logic_xor:
967 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
968 return;
969
970 case ir_binop_logic_or:
971 temp = dst_reg(this, glsl_type::bool_type);
972 emit(OR(temp, op[0], op[1]));
973 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
974 return;
975
976 case ir_binop_logic_and:
977 temp = dst_reg(this, glsl_type::bool_type);
978 emit(AND(temp, op[0], op[1]));
979 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
980 return;
981
982 case ir_unop_f2b:
983 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
984 return;
985
986 case ir_unop_i2b:
987 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
988 return;
989
990 case ir_binop_greater:
991 case ir_binop_gequal:
992 case ir_binop_less:
993 case ir_binop_lequal:
994 case ir_binop_equal:
995 case ir_binop_nequal:
996 emit(IF(op[0], op[1],
997 brw_conditional_for_comparison(expr->operation)));
998 return;
999
1000 case ir_binop_all_equal:
1001 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1002 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1003 return;
1004
1005 case ir_binop_any_nequal:
1006 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1007 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1008 return;
1009
1010 case ir_unop_any:
1011 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1012 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1013 return;
1014
1015 case ir_triop_csel: {
1016 /* Expand the boolean condition into the flag register. */
1017 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1018 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1019
1020 /* Select which boolean to return. */
1021 dst_reg temp(this, expr->operands[1]->type);
1022 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1023 inst->predicate = BRW_PREDICATE_NORMAL;
1024
1025 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1026 return;
1027 }
1028
1029 default:
1030 unreachable("not reached");
1031 }
1032 return;
1033 }
1034
1035 ir->condition->accept(this);
1036
1037 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1038 }
1039
1040 void
1041 vec4_visitor::visit(ir_variable *ir)
1042 {
1043 dst_reg *reg = NULL;
1044
1045 if (variable_storage(ir))
1046 return;
1047
1048 switch (ir->data.mode) {
1049 case ir_var_shader_in:
1050 assert(ir->data.location != -1);
1051 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1052 break;
1053
1054 case ir_var_shader_out:
1055 assert(ir->data.location != -1);
1056 reg = new(mem_ctx) dst_reg(this, ir->type);
1057
1058 for (int i = 0; i < type_size(ir->type); i++) {
1059 output_reg[ir->data.location + i] = *reg;
1060 output_reg[ir->data.location + i].reg_offset = i;
1061 output_reg_annotation[ir->data.location + i] = ir->name;
1062 }
1063 break;
1064
1065 case ir_var_auto:
1066 case ir_var_temporary:
1067 reg = new(mem_ctx) dst_reg(this, ir->type);
1068 break;
1069
1070 case ir_var_uniform:
1071 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1072
1073 /* Thanks to the lower_ubo_reference pass, we will see only
1074 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1075 * variables, so no need for them to be in variable_ht.
1076 *
1077 * Some uniforms, such as samplers and atomic counters, have no actual
1078 * storage, so we should ignore them.
1079 */
1080 if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
1081 return;
1082
1083 /* Track how big the whole uniform variable is, in case we need to put a
1084 * copy of its data into pull constants for array access.
1085 */
1086 assert(this->uniforms < uniform_array_size);
1087 this->uniform_size[this->uniforms] = type_size(ir->type);
1088
1089 if (!strncmp(ir->name, "gl_", 3)) {
1090 setup_builtin_uniform_values(ir);
1091 } else {
1092 setup_uniform_values(ir);
1093 }
1094 break;
1095
1096 case ir_var_system_value:
1097 reg = make_reg_for_system_value(ir->data.location, ir->type);
1098 break;
1099
1100 default:
1101 unreachable("not reached");
1102 }
1103
1104 reg->type = brw_type_for_base_type(ir->type);
1105 hash_table_insert(this->variable_ht, reg, ir);
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_loop *ir)
1110 {
1111 /* We don't want debugging output to print the whole body of the
1112 * loop as the annotation.
1113 */
1114 this->base_ir = NULL;
1115
1116 emit(BRW_OPCODE_DO);
1117
1118 visit_instructions(&ir->body_instructions);
1119
1120 emit(BRW_OPCODE_WHILE);
1121 }
1122
1123 void
1124 vec4_visitor::visit(ir_loop_jump *ir)
1125 {
1126 switch (ir->mode) {
1127 case ir_loop_jump::jump_break:
1128 emit(BRW_OPCODE_BREAK);
1129 break;
1130 case ir_loop_jump::jump_continue:
1131 emit(BRW_OPCODE_CONTINUE);
1132 break;
1133 }
1134 }
1135
1136
1137 void
1138 vec4_visitor::visit(ir_function_signature *)
1139 {
1140 unreachable("not reached");
1141 }
1142
1143 void
1144 vec4_visitor::visit(ir_function *ir)
1145 {
1146 /* Ignore function bodies other than main() -- we shouldn't see calls to
1147 * them since they should all be inlined.
1148 */
1149 if (strcmp(ir->name, "main") == 0) {
1150 const ir_function_signature *sig;
1151 exec_list empty;
1152
1153 sig = ir->matching_signature(NULL, &empty, false);
1154
1155 assert(sig);
1156
1157 visit_instructions(&sig->body);
1158 }
1159 }
1160
1161 bool
1162 vec4_visitor::try_emit_mad(ir_expression *ir)
1163 {
1164 /* 3-src instructions were introduced in gen6. */
1165 if (devinfo->gen < 6)
1166 return false;
1167
1168 /* MAD can only handle floating-point data. */
1169 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1170 return false;
1171
1172 ir_rvalue *nonmul;
1173 ir_expression *mul;
1174 bool mul_negate, mul_abs;
1175
1176 for (int i = 0; i < 2; i++) {
1177 mul_negate = false;
1178 mul_abs = false;
1179
1180 mul = ir->operands[i]->as_expression();
1181 nonmul = ir->operands[1 - i];
1182
1183 if (mul && mul->operation == ir_unop_abs) {
1184 mul = mul->operands[0]->as_expression();
1185 mul_abs = true;
1186 } else if (mul && mul->operation == ir_unop_neg) {
1187 mul = mul->operands[0]->as_expression();
1188 mul_negate = true;
1189 }
1190
1191 if (mul && mul->operation == ir_binop_mul)
1192 break;
1193 }
1194
1195 if (!mul || mul->operation != ir_binop_mul)
1196 return false;
1197
1198 nonmul->accept(this);
1199 src_reg src0 = fix_3src_operand(this->result);
1200
1201 mul->operands[0]->accept(this);
1202 src_reg src1 = fix_3src_operand(this->result);
1203 src1.negate ^= mul_negate;
1204 src1.abs = mul_abs;
1205 if (mul_abs)
1206 src1.negate = false;
1207
1208 mul->operands[1]->accept(this);
1209 src_reg src2 = fix_3src_operand(this->result);
1210 src2.abs = mul_abs;
1211 if (mul_abs)
1212 src2.negate = false;
1213
1214 this->result = src_reg(this, ir->type);
1215 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1216
1217 return true;
1218 }
1219
1220 bool
1221 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1222 {
1223 /* This optimization relies on CMP setting the destination to 0 when
1224 * false. Early hardware only sets the least significant bit, and
1225 * leaves the other bits undefined. So we can't use it.
1226 */
1227 if (devinfo->gen < 6)
1228 return false;
1229
1230 ir_expression *const cmp = ir->operands[0]->as_expression();
1231
1232 if (cmp == NULL)
1233 return false;
1234
1235 switch (cmp->operation) {
1236 case ir_binop_less:
1237 case ir_binop_greater:
1238 case ir_binop_lequal:
1239 case ir_binop_gequal:
1240 case ir_binop_equal:
1241 case ir_binop_nequal:
1242 break;
1243
1244 default:
1245 return false;
1246 }
1247
1248 cmp->operands[0]->accept(this);
1249 const src_reg cmp_src0 = this->result;
1250
1251 cmp->operands[1]->accept(this);
1252 const src_reg cmp_src1 = this->result;
1253
1254 this->result = src_reg(this, ir->type);
1255
1256 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1257 brw_conditional_for_comparison(cmp->operation)));
1258
1259 /* If the comparison is false, this->result will just happen to be zero.
1260 */
1261 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1262 this->result, src_reg(1.0f));
1263 inst->predicate = BRW_PREDICATE_NORMAL;
1264 inst->predicate_inverse = true;
1265
1266 return true;
1267 }
1268
1269 vec4_instruction *
1270 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1271 src_reg src0, src_reg src1)
1272 {
1273 vec4_instruction *inst;
1274
1275 if (devinfo->gen >= 6) {
1276 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1277 inst->conditional_mod = conditionalmod;
1278 } else {
1279 emit(CMP(dst, src0, src1, conditionalmod));
1280
1281 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1282 inst->predicate = BRW_PREDICATE_NORMAL;
1283 }
1284
1285 return inst;
1286 }
1287
1288 vec4_instruction *
1289 vec4_visitor::emit_lrp(const dst_reg &dst,
1290 const src_reg &x, const src_reg &y, const src_reg &a)
1291 {
1292 if (devinfo->gen >= 6) {
1293 /* Note that the instruction's argument order is reversed from GLSL
1294 * and the IR.
1295 */
1296 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1297 fix_3src_operand(x)));
1298 } else {
1299 /* Earlier generations don't support three source operations, so we
1300 * need to emit x*(1-a) + y*a.
1301 */
1302 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1303 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1304 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1305 y_times_a.writemask = dst.writemask;
1306 one_minus_a.writemask = dst.writemask;
1307 x_times_one_minus_a.writemask = dst.writemask;
1308
1309 emit(MUL(y_times_a, y, a));
1310 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1311 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1312 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1313 }
1314 }
1315
1316 /**
 * Emits the instructions needed to perform a pull constant load. before_block
 * and before_inst can both be NULL, in which case the instructions will be
 * appended to the end of the instruction list.
1320 */
1321 void
1322 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1323 src_reg surf_index,
1324 src_reg offset_reg,
1325 bblock_t *before_block,
1326 vec4_instruction *before_inst)
1327 {
1328 assert((before_inst == NULL && before_block == NULL) ||
1329 (before_inst && before_block));
1330
1331 vec4_instruction *pull;
1332
1333 if (devinfo->gen >= 9) {
1334 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1335 src_reg header(this, glsl_type::uvec4_type, 2);
1336
1337 pull = new(mem_ctx)
1338 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1339 dst_reg(header));
1340
1341 if (before_inst)
1342 emit_before(before_block, before_inst, pull);
1343 else
1344 emit(pull);
1345
1346 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1347 offset_reg.type);
1348 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1349
1350 if (before_inst)
1351 emit_before(before_block, before_inst, pull);
1352 else
1353 emit(pull);
1354
1355 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1356 dst,
1357 surf_index,
1358 header);
1359 pull->mlen = 2;
1360 pull->header_size = 1;
1361 } else if (devinfo->gen >= 7) {
1362 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1363
1364 grf_offset.type = offset_reg.type;
1365
1366 pull = MOV(grf_offset, offset_reg);
1367
1368 if (before_inst)
1369 emit_before(before_block, before_inst, pull);
1370 else
1371 emit(pull);
1372
1373 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1374 dst,
1375 surf_index,
1376 src_reg(grf_offset));
1377 pull->mlen = 1;
1378 } else {
1379 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1380 dst,
1381 surf_index,
1382 offset_reg);
1383 pull->base_mrf = 14;
1384 pull->mlen = 1;
1385 }
1386
1387 if (before_inst)
1388 emit_before(before_block, before_inst, pull);
1389 else
1390 emit(pull);
1391 }
1392
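/* Replicate the value of @src from one live channel into all channels of
 * the result, so it can be used where a single (uniform) value is needed,
 * e.g. as the dynamically computed surface index for the UBO load in
 * visit(ir_expression *).
 */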
1393 src_reg
1394 vec4_visitor::emit_uniformize(const src_reg &src)
1395 {
1396 const src_reg chan_index(this, glsl_type::uint_type);
1397 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1398 src.type);
1399
1400 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1401 ->force_writemask_all = true;
1402 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1403 ->force_writemask_all = true;
1404
1405 return src_reg(dst);
1406 }
1407
1408 void
1409 vec4_visitor::visit(ir_expression *ir)
1410 {
1411 unsigned int operand;
1412 src_reg op[ARRAY_SIZE(ir->operands)];
1413 vec4_instruction *inst;
1414
1415 if (ir->operation == ir_binop_add) {
1416 if (try_emit_mad(ir))
1417 return;
1418 }
1419
1420 if (ir->operation == ir_unop_b2f) {
1421 if (try_emit_b2f_of_compare(ir))
1422 return;
1423 }
1424
1425 /* Storage for our result. Ideally for an assignment we'd be using
1426 * the actual storage for the result here, instead.
1427 */
1428 dst_reg result_dst(this, ir->type);
1429 src_reg result_src(result_dst);
1430
1431 if (ir->operation == ir_triop_csel) {
1432 ir->operands[1]->accept(this);
1433 op[1] = this->result;
1434 ir->operands[2]->accept(this);
1435 op[2] = this->result;
1436
1437 enum brw_predicate predicate;
1438 emit_bool_to_cond_code(ir->operands[0], &predicate);
1439 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1440 inst->predicate = predicate;
1441 this->result = result_src;
1442 return;
1443 }
1444
1445 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1446 this->result.file = BAD_FILE;
1447 ir->operands[operand]->accept(this);
1448 if (this->result.file == BAD_FILE) {
1449 fprintf(stderr, "Failed to get tree for expression operand:\n");
1450 ir->operands[operand]->fprint(stderr);
1451 exit(1);
1452 }
1453 op[operand] = this->result;
1454
1455 /* Matrix expression operands should have been broken down to vector
1456 * operations already.
1457 */
1458 assert(!ir->operands[operand]->type->is_matrix());
1459 }
1460
1461 /* If nothing special happens, this is the result. */
1462 this->result = result_src;
1463
1464 switch (ir->operation) {
1465 case ir_unop_logic_not:
1466 emit(NOT(result_dst, op[0]));
1467 break;
1468 case ir_unop_neg:
1469 op[0].negate = !op[0].negate;
1470 emit(MOV(result_dst, op[0]));
1471 break;
1472 case ir_unop_abs:
1473 op[0].abs = true;
1474 op[0].negate = false;
1475 emit(MOV(result_dst, op[0]));
1476 break;
1477
1478 case ir_unop_sign:
1479 if (ir->type->is_float()) {
1480 /* AND(val, 0x80000000) gives the sign bit.
1481 *
1482 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1483 * zero.
1484 */
1485 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1486
1487 op[0].type = BRW_REGISTER_TYPE_UD;
1488 result_dst.type = BRW_REGISTER_TYPE_UD;
1489 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1490
1491 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1492 inst->predicate = BRW_PREDICATE_NORMAL;
1493
1494 this->result.type = BRW_REGISTER_TYPE_F;
1495 } else {
1496 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1497 * -> non-negative val generates 0x00000000.
1498 * Predicated OR sets 1 if val is positive.
1499 */
1500 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1501
1502 emit(ASR(result_dst, op[0], src_reg(31)));
1503
1504 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1505 inst->predicate = BRW_PREDICATE_NORMAL;
1506 }
1507 break;
1508
1509 case ir_unop_rcp:
1510 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1511 break;
1512
1513 case ir_unop_exp2:
1514 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1515 break;
1516 case ir_unop_log2:
1517 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1518 break;
1519 case ir_unop_exp:
1520 case ir_unop_log:
1521 unreachable("not reached: should be handled by ir_explog_to_explog2");
1522 case ir_unop_sin:
1523 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1524 break;
1525 case ir_unop_cos:
1526 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1527 break;
1528
1529 case ir_unop_dFdx:
1530 case ir_unop_dFdx_coarse:
1531 case ir_unop_dFdx_fine:
1532 case ir_unop_dFdy:
1533 case ir_unop_dFdy_coarse:
1534 case ir_unop_dFdy_fine:
1535 unreachable("derivatives not valid in vertex shader");
1536
1537 case ir_unop_bitfield_reverse:
1538 emit(BFREV(result_dst, op[0]));
1539 break;
1540 case ir_unop_bit_count:
1541 emit(CBIT(result_dst, op[0]));
1542 break;
1543 case ir_unop_find_msb: {
1544 src_reg temp = src_reg(this, glsl_type::uint_type);
1545
1546 inst = emit(FBH(dst_reg(temp), op[0]));
1547 inst->dst.writemask = WRITEMASK_XYZW;
1548
1549 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1550 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1551 * subtract the result from 31 to convert the MSB count into an LSB count.
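 *
 * (For example, an input of 4 makes FBH return 29, and the predicated ADD
 *  below produces 31 - 29 = 2, which is what findMSB() expects.)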
1552 */
1553
1554 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1555 temp.swizzle = BRW_SWIZZLE_NOOP;
1556 emit(MOV(result_dst, temp));
1557
1558 src_reg src_tmp = src_reg(result_dst);
1559 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1560
1561 src_tmp.negate = true;
1562 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1563 inst->predicate = BRW_PREDICATE_NORMAL;
1564 break;
1565 }
1566 case ir_unop_find_lsb:
1567 emit(FBL(result_dst, op[0]));
1568 break;
1569 case ir_unop_saturate:
1570 inst = emit(MOV(result_dst, op[0]));
1571 inst->saturate = true;
1572 break;
1573
1574 case ir_unop_noise:
1575 unreachable("not reached: should be handled by lower_noise");
1576
1577 case ir_unop_subroutine_to_int:
1578 emit(MOV(result_dst, op[0]));
1579 break;
1580
1581 case ir_binop_add:
1582 emit(ADD(result_dst, op[0], op[1]));
1583 break;
1584 case ir_binop_sub:
1585 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1586
1587 case ir_binop_mul:
1588 if (devinfo->gen < 8 && ir->type->is_integer()) {
1589 /* For integer multiplication, the MUL uses the low 16 bits of one of
1590 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1591 * accumulates in the contribution of the upper 16 bits of that
1592 * operand. If we can determine that one of the args is in the low
1593 * 16 bits, though, we can just emit a single MUL.
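 *
 * (The is_uint16_constant() checks below catch cases like "x * 4", where
 *  one operand is a constant that fits in 16 bits, so a single MUL
 *  suffices and the MACH/MOV pair is skipped.)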
1594 */
1595 if (ir->operands[0]->is_uint16_constant()) {
1596 if (devinfo->gen < 7)
1597 emit(MUL(result_dst, op[0], op[1]));
1598 else
1599 emit(MUL(result_dst, op[1], op[0]));
1600 } else if (ir->operands[1]->is_uint16_constant()) {
1601 if (devinfo->gen < 7)
1602 emit(MUL(result_dst, op[1], op[0]));
1603 else
1604 emit(MUL(result_dst, op[0], op[1]));
1605 } else {
1606 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1607
1608 emit(MUL(acc, op[0], op[1]));
1609 emit(MACH(dst_null_d(), op[0], op[1]));
1610 emit(MOV(result_dst, src_reg(acc)));
1611 }
1612 } else {
1613 emit(MUL(result_dst, op[0], op[1]));
1614 }
1615 break;
1616 case ir_binop_imul_high: {
1617 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1618
1619 emit(MUL(acc, op[0], op[1]));
1620 emit(MACH(result_dst, op[0], op[1]));
1621 break;
1622 }
1623 case ir_binop_div:
1624 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1625 assert(ir->type->is_integer());
1626 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1627 break;
1628
1629 case ir_binop_carry:
1630 unreachable("Should have been lowered by carry_to_arith().");
1631
1632 case ir_binop_borrow:
1633 unreachable("Should have been lowered by borrow_to_arith().");
1634
1635 case ir_binop_mod:
1636 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1637 assert(ir->type->is_integer());
1638 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1639 break;
1640
1641 case ir_binop_less:
1642 case ir_binop_greater:
1643 case ir_binop_lequal:
1644 case ir_binop_gequal:
1645 case ir_binop_equal:
1646 case ir_binop_nequal: {
1647 if (devinfo->gen <= 5) {
1648 resolve_bool_comparison(ir->operands[0], &op[0]);
1649 resolve_bool_comparison(ir->operands[1], &op[1]);
1650 }
1651 emit(CMP(result_dst, op[0], op[1],
1652 brw_conditional_for_comparison(ir->operation)));
1653 break;
1654 }
1655
1656 case ir_binop_all_equal:
1657 if (devinfo->gen <= 5) {
1658 resolve_bool_comparison(ir->operands[0], &op[0]);
1659 resolve_bool_comparison(ir->operands[1], &op[1]);
1660 }
1661
1662 /* "==" operator producing a scalar boolean. */
1663 if (ir->operands[0]->type->is_vector() ||
1664 ir->operands[1]->type->is_vector()) {
1665 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1666 emit(MOV(result_dst, src_reg(0)));
1667 inst = emit(MOV(result_dst, src_reg(~0)));
1668 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1669 } else {
1670 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1671 }
1672 break;
1673 case ir_binop_any_nequal:
1674 if (devinfo->gen <= 5) {
1675 resolve_bool_comparison(ir->operands[0], &op[0]);
1676 resolve_bool_comparison(ir->operands[1], &op[1]);
1677 }
1678
1679 /* "!=" operator producing a scalar boolean. */
1680 if (ir->operands[0]->type->is_vector() ||
1681 ir->operands[1]->type->is_vector()) {
1682 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1683
1684 emit(MOV(result_dst, src_reg(0)));
1685 inst = emit(MOV(result_dst, src_reg(~0)));
1686 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1687 } else {
1688 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1689 }
1690 break;
1691
1692 case ir_unop_any:
1693 if (devinfo->gen <= 5) {
1694 resolve_bool_comparison(ir->operands[0], &op[0]);
1695 }
1696 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1697 emit(MOV(result_dst, src_reg(0)));
1698
1699 inst = emit(MOV(result_dst, src_reg(~0)));
1700 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1701 break;
1702
1703 case ir_binop_logic_xor:
1704 emit(XOR(result_dst, op[0], op[1]));
1705 break;
1706
1707 case ir_binop_logic_or:
1708 emit(OR(result_dst, op[0], op[1]));
1709 break;
1710
1711 case ir_binop_logic_and:
1712 emit(AND(result_dst, op[0], op[1]));
1713 break;
1714
1715 case ir_binop_dot:
1716 assert(ir->operands[0]->type->is_vector());
1717 assert(ir->operands[0]->type == ir->operands[1]->type);
1718 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1719 break;
1720
1721 case ir_unop_sqrt:
1722 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1723 break;
1724 case ir_unop_rsq:
1725 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1726 break;
1727
1728 case ir_unop_bitcast_i2f:
1729 case ir_unop_bitcast_u2f:
1730 this->result = op[0];
1731 this->result.type = BRW_REGISTER_TYPE_F;
1732 break;
1733
1734 case ir_unop_bitcast_f2i:
1735 this->result = op[0];
1736 this->result.type = BRW_REGISTER_TYPE_D;
1737 break;
1738
1739 case ir_unop_bitcast_f2u:
1740 this->result = op[0];
1741 this->result.type = BRW_REGISTER_TYPE_UD;
1742 break;
1743
1744 case ir_unop_i2f:
1745 case ir_unop_i2u:
1746 case ir_unop_u2i:
1747 case ir_unop_u2f:
1748 case ir_unop_f2i:
1749 case ir_unop_f2u:
1750 emit(MOV(result_dst, op[0]));
1751 break;
1752 case ir_unop_b2i:
1753 case ir_unop_b2f:
1754 if (devinfo->gen <= 5) {
1755 resolve_bool_comparison(ir->operands[0], &op[0]);
1756 }
1757 emit(MOV(result_dst, negate(op[0])));
1758 break;
1759 case ir_unop_f2b:
1760 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1761 break;
1762 case ir_unop_i2b:
1763 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1764 break;
1765
1766 case ir_unop_trunc:
1767 emit(RNDZ(result_dst, op[0]));
1768 break;
1769 case ir_unop_ceil: {
1770 src_reg tmp = src_reg(this, ir->type);
1771 op[0].negate = !op[0].negate;
1772 emit(RNDD(dst_reg(tmp), op[0]));
1773 tmp.negate = true;
1774 emit(MOV(result_dst, tmp));
1775 }
1776 break;
1777 case ir_unop_floor:
1778 inst = emit(RNDD(result_dst, op[0]));
1779 break;
1780 case ir_unop_fract:
1781 inst = emit(FRC(result_dst, op[0]));
1782 break;
1783 case ir_unop_round_even:
1784 emit(RNDE(result_dst, op[0]));
1785 break;
1786
1787 case ir_binop_min:
1788 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1789 break;
1790 case ir_binop_max:
1791 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1792 break;
1793
1794 case ir_binop_pow:
1795 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1796 break;
1797
1798 case ir_unop_bit_not:
1799 inst = emit(NOT(result_dst, op[0]));
1800 break;
1801 case ir_binop_bit_and:
1802 inst = emit(AND(result_dst, op[0], op[1]));
1803 break;
1804 case ir_binop_bit_xor:
1805 inst = emit(XOR(result_dst, op[0], op[1]));
1806 break;
1807 case ir_binop_bit_or:
1808 inst = emit(OR(result_dst, op[0], op[1]));
1809 break;
1810
1811 case ir_binop_lshift:
1812 inst = emit(SHL(result_dst, op[0], op[1]));
1813 break;
1814
1815 case ir_binop_rshift:
1816 if (ir->type->base_type == GLSL_TYPE_INT)
1817 inst = emit(ASR(result_dst, op[0], op[1]));
1818 else
1819 inst = emit(SHR(result_dst, op[0], op[1]));
1820 break;
1821
1822 case ir_binop_bfm:
1823 emit(BFI1(result_dst, op[0], op[1]));
1824 break;
1825
1826 case ir_binop_ubo_load: {
1827 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1828 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1829 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1830 src_reg offset;
1831
1832 /* Now, load the vector from that offset. */
1833 assert(ir->type->is_vector() || ir->type->is_scalar());
1834
1835 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1836 packed_consts.type = result.type;
1837 src_reg surf_index;
1838
1839 if (const_uniform_block) {
1840 /* The block index is a constant, so just emit the binding table entry
1841 * as an immediate.
1842 */
1843 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1844 const_uniform_block->value.u[0]);
1845 } else {
1846 /* The block index is not a constant. Evaluate the index expression
1847 * per-channel and add the base UBO index; we have to select a value
1848 * from any live channel.
1849 */
1850 surf_index = src_reg(this, glsl_type::uint_type);
1851 emit(ADD(dst_reg(surf_index), op[0],
1852 src_reg(prog_data->base.binding_table.ubo_start)));
1853 surf_index = emit_uniformize(surf_index);
1854
1855 /* Assume this may touch any UBO. It would be nice to provide
1856 * a tighter bound, but the array information is already lowered away.
1857 */
1858 brw_mark_surface_used(&prog_data->base,
1859 prog_data->base.binding_table.ubo_start +
1860 shader_prog->NumUniformBlocks - 1);
1861 }
1862
1863 if (const_offset_ir) {
1864 if (devinfo->gen >= 8) {
1865 /* Store the offset in a GRF so we can send-from-GRF. */
1866 offset = src_reg(this, glsl_type::int_type);
1867 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1868 } else {
1869 /* Immediates are fine on older generations since they'll be moved
1870 * to a (potentially fake) MRF at the generator level.
1871 */
1872 offset = src_reg(const_offset / 16);
1873 }
1874 } else {
1875 offset = src_reg(this, glsl_type::uint_type);
1876 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1877 }
1878
1879 emit_pull_constant_load_reg(dst_reg(packed_consts),
1880 surf_index,
1881 offset,
1882 NULL, NULL /* before_block/inst */);
1883
1884 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1885 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1886 const_offset % 16 / 4,
1887 const_offset % 16 / 4,
1888 const_offset % 16 / 4);
1889
1890 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1891 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1892 emit(CMP(result_dst, packed_consts, src_reg(0u),
1893 BRW_CONDITIONAL_NZ));
1894 } else {
1895 emit(MOV(result_dst, packed_consts));
1896 }
1897 break;
1898 }
1899
1900 case ir_binop_vector_extract:
1901 unreachable("should have been lowered by vec_index_to_cond_assign");
1902
1903 case ir_triop_fma:
1904 op[0] = fix_3src_operand(op[0]);
1905 op[1] = fix_3src_operand(op[1]);
1906 op[2] = fix_3src_operand(op[2]);
1907 /* Note that the instruction's argument order is reversed from GLSL
1908 * and the IR.
1909 */
1910 emit(MAD(result_dst, op[2], op[1], op[0]));
1911 break;
1912
1913 case ir_triop_lrp:
1914 emit_lrp(result_dst, op[0], op[1], op[2]);
1915 break;
1916
1917 case ir_triop_csel:
1918 unreachable("already handled above");
1919 break;
1920
1921 case ir_triop_bfi:
1922 op[0] = fix_3src_operand(op[0]);
1923 op[1] = fix_3src_operand(op[1]);
1924 op[2] = fix_3src_operand(op[2]);
1925 emit(BFI2(result_dst, op[0], op[1], op[2]));
1926 break;
1927
1928 case ir_triop_bitfield_extract:
1929 op[0] = fix_3src_operand(op[0]);
1930 op[1] = fix_3src_operand(op[1]);
1931 op[2] = fix_3src_operand(op[2]);
1932 /* Note that the instruction's argument order is reversed from GLSL
1933 * and the IR.
1934 */
1935 emit(BFE(result_dst, op[2], op[1], op[0]));
1936 break;
1937
1938 case ir_triop_vector_insert:
1939 unreachable("should have been lowered by lower_vector_insert");
1940
1941 case ir_quadop_bitfield_insert:
1942 unreachable("not reached: should be handled by "
1943 "bitfield_insert_to_bfm_bfi\n");
1944
1945 case ir_quadop_vector:
1946 unreachable("not reached: should be handled by lower_quadop_vector");
1947
1948 case ir_unop_pack_half_2x16:
1949 emit_pack_half_2x16(result_dst, op[0]);
1950 break;
1951 case ir_unop_unpack_half_2x16:
1952 emit_unpack_half_2x16(result_dst, op[0]);
1953 break;
1954 case ir_unop_unpack_unorm_4x8:
1955 emit_unpack_unorm_4x8(result_dst, op[0]);
1956 break;
1957 case ir_unop_unpack_snorm_4x8:
1958 emit_unpack_snorm_4x8(result_dst, op[0]);
1959 break;
1960 case ir_unop_pack_unorm_4x8:
1961 emit_pack_unorm_4x8(result_dst, op[0]);
1962 break;
1963 case ir_unop_pack_snorm_4x8:
1964 emit_pack_snorm_4x8(result_dst, op[0]);
1965 break;
1966 case ir_unop_pack_snorm_2x16:
1967 case ir_unop_pack_unorm_2x16:
1968 case ir_unop_unpack_snorm_2x16:
1969 case ir_unop_unpack_unorm_2x16:
1970 unreachable("not reached: should be handled by lower_packing_builtins");
1971 case ir_unop_unpack_half_2x16_split_x:
1972 case ir_unop_unpack_half_2x16_split_y:
1973 case ir_binop_pack_half_2x16_split:
1974 case ir_unop_interpolate_at_centroid:
1975 case ir_binop_interpolate_at_sample:
1976 case ir_binop_interpolate_at_offset:
1977 unreachable("not reached: should not occur in vertex shader");
1978 case ir_binop_ldexp:
1979 unreachable("not reached: should be handled by ldexp_to_arith()");
1980 case ir_unop_d2f:
1981 case ir_unop_f2d:
1982 case ir_unop_d2i:
1983 case ir_unop_i2d:
1984 case ir_unop_d2u:
1985 case ir_unop_u2d:
1986 case ir_unop_d2b:
1987 case ir_unop_pack_double_2x32:
1988 case ir_unop_unpack_double_2x32:
1989 case ir_unop_frexp_sig:
1990 case ir_unop_frexp_exp:
1991 unreachable("fp64 todo");
1992 }
1993 }
1994
1995
1996 void
1997 vec4_visitor::visit(ir_swizzle *ir)
1998 {
1999 /* Note that this is only swizzles in expressions, not those on the left
2000 * hand side of an assignment, which do write masking. See ir_assignment
2001 * for that.
2002 */
2003 const unsigned swz = brw_compose_swizzle(
2004 brw_swizzle_for_size(ir->type->vector_elements),
2005 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2006
2007 ir->val->accept(this);
2008 this->result = swizzle(this->result, swz);
2009 }
2010
2011 void
2012 vec4_visitor::visit(ir_dereference_variable *ir)
2013 {
2014 const struct glsl_type *type = ir->type;
2015 dst_reg *reg = variable_storage(ir->var);
2016
2017 if (!reg) {
2018 fail("Failed to find variable storage for %s\n", ir->var->name);
2019 this->result = src_reg(brw_null_reg());
2020 return;
2021 }
2022
2023 this->result = src_reg(*reg);
2024
2025 /* System values get their swizzle from the dst_reg writemask */
2026 if (ir->var->data.mode == ir_var_system_value)
2027 return;
2028
2029 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2030 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2031 }
2032
2033
2034 int
2035 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2036 {
2037 /* Under normal circumstances array elements are stored consecutively, so
2038 * the stride is equal to the size of the array element.
2039 */
2040 return type_size(ir->type);
2041 }
2042
2043
2044 void
2045 vec4_visitor::visit(ir_dereference_array *ir)
2046 {
2047 ir_constant *constant_index;
2048 src_reg src;
2049 int array_stride = compute_array_stride(ir);
2050
2051 constant_index = ir->array_index->constant_expression_value();
2052
2053 ir->array->accept(this);
2054 src = this->result;
2055
2056 if (constant_index) {
2057 src.reg_offset += constant_index->value.i[0] * array_stride;
2058 } else {
2059 /* Variable index array dereference. It eats the "vec4" of the
2060 * base of the array and an index that offsets the Mesa register
2061 * index.
2062 */
2063 ir->array_index->accept(this);
2064
2065 src_reg index_reg;
2066
2067 if (array_stride == 1) {
2068 index_reg = this->result;
2069 } else {
2070 index_reg = src_reg(this, glsl_type::int_type);
2071
2072 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2073 }
2074
2075 if (src.reladdr) {
2076 src_reg temp = src_reg(this, glsl_type::int_type);
2077
2078 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2079
2080 index_reg = temp;
2081 }
2082
2083 src.reladdr = ralloc(mem_ctx, src_reg);
2084 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2085 }
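/* Illustrative note (not part of the original source): array_stride is
 * measured in vec4 registers, so for an array of mat4 (type_size 4) an index
 * of i is scaled to a reladdr of i * 4, while an array of vec4 or float uses
 * the index directly. If the array base already carried a reladdr (nested
 * variable indexing), the two are simply added into one combined offset.
 */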
2086
2087 /* If the type is smaller than a vec4, replicate the last channel out. */
2088 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2089 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2090 else
2091 src.swizzle = BRW_SWIZZLE_NOOP;
2092 src.type = brw_type_for_base_type(ir->type);
2093
2094 this->result = src;
2095 }
2096
2097 void
2098 vec4_visitor::visit(ir_dereference_record *ir)
2099 {
2100 unsigned int i;
2101 const glsl_type *struct_type = ir->record->type;
2102 int offset = 0;
2103
2104 ir->record->accept(this);
2105
2106 for (i = 0; i < struct_type->length; i++) {
2107 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2108 break;
2109 offset += type_size(struct_type->fields.structure[i].type);
2110 }
2111
2112 /* If the type is smaller than a vec4, replicate the last channel out. */
2113 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2114 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2115 else
2116 this->result.swizzle = BRW_SWIZZLE_NOOP;
2117 this->result.type = brw_type_for_base_type(ir->type);
2118
2119 this->result.reg_offset += offset;
2120 }
2121
2122 /**
2123 * We want to be careful in assignment setup to hit the actual storage
2124 * instead of potentially using a temporary like we might with the
2125 * ir_dereference handler.
2126 */
2127 static dst_reg
2128 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2129 {
2130 /* The LHS must be a dereference. If the LHS is a variable indexed array
2131 * access of a vector, it must be separated into a series of conditional moves
2132 * before reaching this point (see ir_vec_index_to_cond_assign).
2133 */
2134 assert(ir->as_dereference());
2135 ir_dereference_array *deref_array = ir->as_dereference_array();
2136 if (deref_array) {
2137 assert(!deref_array->array->type->is_vector());
2138 }
2139
2140 /* Use the rvalue deref handler for the most part. We'll ignore
2141 * swizzles in it and write swizzles using writemask, though.
2142 */
2143 ir->accept(v);
2144 return dst_reg(v->result);
2145 }
2146
2147 void
2148 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2149 const struct glsl_type *type,
2150 enum brw_predicate predicate)
2151 {
2152 if (type->base_type == GLSL_TYPE_STRUCT) {
2153 for (unsigned int i = 0; i < type->length; i++) {
2154 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2155 }
2156 return;
2157 }
2158
2159 if (type->is_array()) {
2160 for (unsigned int i = 0; i < type->length; i++) {
2161 emit_block_move(dst, src, type->fields.array, predicate);
2162 }
2163 return;
2164 }
2165
2166 if (type->is_matrix()) {
2167 const struct glsl_type *vec_type;
2168
2169 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2170 type->vector_elements, 1);
2171
2172 for (int i = 0; i < type->matrix_columns; i++) {
2173 emit_block_move(dst, src, vec_type, predicate);
2174 }
2175 return;
2176 }
2177
2178 assert(type->is_scalar() || type->is_vector());
2179
2180 dst->type = brw_type_for_base_type(type);
2181 src->type = dst->type;
2182
2183 dst->writemask = (1 << type->vector_elements) - 1;
2184
2185 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2186
2187 vec4_instruction *inst = emit(MOV(*dst, *src));
2188 inst->predicate = predicate;
2189
2190 dst->reg_offset++;
2191 src->reg_offset++;
2192 }
2193
2194
2195 /* If the RHS processing resulted in an instruction generating a
2196 * temporary value, and it would be easy to rewrite the instruction to
2197 * generate its result right into the LHS instead, do so. This ends
2198 * up reliably removing instructions where it can be tricky to do so
2199 * later without real UD chain information.
2200 */
2201 bool
2202 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2203 dst_reg dst,
2204 src_reg src,
2205 vec4_instruction *pre_rhs_inst,
2206 vec4_instruction *last_rhs_inst)
2207 {
2208 /* This could be supported, but it would take more smarts. */
2209 if (ir->condition)
2210 return false;
2211
2212 if (pre_rhs_inst == last_rhs_inst)
2213 return false; /* No instructions generated to work with. */
2214
2215 /* Make sure the last instruction generated our source reg. */
2216 if (src.file != GRF ||
2217 src.file != last_rhs_inst->dst.file ||
2218 src.reg != last_rhs_inst->dst.reg ||
2219 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2220 src.reladdr ||
2221 src.abs ||
2222 src.negate ||
2223 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2224 return false;
2225
2226 /* Check that the last instruction fully initialized the channels
2227 * we want to use, in the order we want to use them. We could
2228 * potentially reswizzle the operands of many instructions so that
2229 * we could handle out of order channels, but don't yet.
2230 */
2231
2232 for (unsigned i = 0; i < 4; i++) {
2233 if (dst.writemask & (1 << i)) {
2234 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2235 return false;
2236
2237 if (BRW_GET_SWZ(src.swizzle, i) != i)
2238 return false;
2239 }
2240 }
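/* Illustrative note (not part of the original source): for dst.writemask ==
 * WRITEMASK_XY this loop requires that the candidate instruction wrote at
 * least .x and .y, and that the source swizzle reads .x from x and .y from y;
 * a swizzle such as .yx fails the BRW_GET_SWZ check and keeps the extra MOV.
 */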
2241
2242 /* Success! Rewrite the instruction. */
2243 last_rhs_inst->dst.file = dst.file;
2244 last_rhs_inst->dst.reg = dst.reg;
2245 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2246 last_rhs_inst->dst.reladdr = dst.reladdr;
2247 last_rhs_inst->dst.writemask &= dst.writemask;
2248
2249 return true;
2250 }
2251
2252 void
2253 vec4_visitor::visit(ir_assignment *ir)
2254 {
2255 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2256 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2257
2258 if (!ir->lhs->type->is_scalar() &&
2259 !ir->lhs->type->is_vector()) {
2260 ir->rhs->accept(this);
2261 src_reg src = this->result;
2262
2263 if (ir->condition) {
2264 emit_bool_to_cond_code(ir->condition, &predicate);
2265 }
2266
2267 /* emit_block_move doesn't account for swizzles in the source register.
2268 * This should be ok, since the source register is a structure or an
2269 * array, and those can't be swizzled. But double-check to be sure.
2270 */
2271 assert(src.swizzle ==
2272 (ir->rhs->type->is_matrix()
2273 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2274 : BRW_SWIZZLE_NOOP));
2275
2276 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2277 return;
2278 }
2279
2280 /* Now we're down to just a scalar/vector with writemasks. */
2281 int i;
2282
2283 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2284 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2285
2286 ir->rhs->accept(this);
2287
2288 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2289
2290 int swizzles[4];
2291 int src_chan = 0;
2292
2293 assert(ir->lhs->type->is_vector() ||
2294 ir->lhs->type->is_scalar());
2295 dst.writemask = ir->write_mask;
2296
2297 /* Swizzle a small RHS vector into the channels being written.
2298 *
2299 * glsl ir treats write_mask as dictating how many channels are
2300 * present on the RHS while in our instructions we need to make
2301 * those channels appear in the slots of the vec4 they're written to.
2302 */
2303 for (int i = 0; i < 4; i++)
2304 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
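/* Illustrative note (not part of the original source): for write_mask ==
 * WRITEMASK_YZ the loop above produces swizzles = {0, 0, 1, 0}, so the
 * swizzle built below reads RHS .x into dst.y and RHS .y into dst.z; the
 * unwritten channels read .x but are masked off by dst.writemask anyway.
 */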
2305
2306 src_reg src = swizzle(this->result,
2307 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2308 swizzles[2], swizzles[3]));
2309
2310 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2311 return;
2312 }
2313
2314 if (ir->condition) {
2315 emit_bool_to_cond_code(ir->condition, &predicate);
2316 }
2317
2318 for (i = 0; i < type_size(ir->lhs->type); i++) {
2319 vec4_instruction *inst = emit(MOV(dst, src));
2320 inst->predicate = predicate;
2321
2322 dst.reg_offset++;
2323 src.reg_offset++;
2324 }
2325 }
2326
2327 void
2328 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2329 {
2330 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2331 foreach_in_list(ir_constant, field_value, &ir->components) {
2332 emit_constant_values(dst, field_value);
2333 }
2334 return;
2335 }
2336
2337 if (ir->type->is_array()) {
2338 for (unsigned int i = 0; i < ir->type->length; i++) {
2339 emit_constant_values(dst, ir->array_elements[i]);
2340 }
2341 return;
2342 }
2343
2344 if (ir->type->is_matrix()) {
2345 for (int i = 0; i < ir->type->matrix_columns; i++) {
2346 float *vec = &ir->value.f[i * ir->type->vector_elements];
2347
2348 for (int j = 0; j < ir->type->vector_elements; j++) {
2349 dst->writemask = 1 << j;
2350 dst->type = BRW_REGISTER_TYPE_F;
2351
2352 emit(MOV(*dst, src_reg(vec[j])));
2353 }
2354 dst->reg_offset++;
2355 }
2356 return;
2357 }
2358
2359 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2360
2361 for (int i = 0; i < ir->type->vector_elements; i++) {
2362 if (!(remaining_writemask & (1 << i)))
2363 continue;
2364
2365 dst->writemask = 1 << i;
2366 dst->type = brw_type_for_base_type(ir->type);
2367
2368 /* Find other components that match the one we're about to
2369 * write. Emits fewer instructions for things like vec4(0.5,
2370 * 1.5, 1.5, 1.5).
2371 */
2372 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2373 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2374 if (ir->value.b[i] == ir->value.b[j])
2375 dst->writemask |= (1 << j);
2376 } else {
2377 /* u, i, and f storage all line up, so no need for a
2378 * switch case for comparing each type.
2379 */
2380 if (ir->value.u[i] == ir->value.u[j])
2381 dst->writemask |= (1 << j);
2382 }
2383 }
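/* Illustrative note (not part of the original source): for vec4(0.5, 1.5,
 * 1.5, 1.5) the first pass emits a MOV of 0.5 with writemask .x, the second
 * pass ORs .z and .w into the mask and emits a single MOV of 1.5 with
 * writemask .yzw, and remaining_writemask then causes the rest to be skipped.
 */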
2384
2385 switch (ir->type->base_type) {
2386 case GLSL_TYPE_FLOAT:
2387 emit(MOV(*dst, src_reg(ir->value.f[i])));
2388 break;
2389 case GLSL_TYPE_INT:
2390 emit(MOV(*dst, src_reg(ir->value.i[i])));
2391 break;
2392 case GLSL_TYPE_UINT:
2393 emit(MOV(*dst, src_reg(ir->value.u[i])));
2394 break;
2395 case GLSL_TYPE_BOOL:
2396 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2397 break;
2398 default:
2399 unreachable("Non-float/uint/int/bool constant");
2400 }
2401
2402 remaining_writemask &= ~dst->writemask;
2403 }
2404 dst->reg_offset++;
2405 }
2406
2407 void
2408 vec4_visitor::visit(ir_constant *ir)
2409 {
2410 dst_reg dst = dst_reg(this, ir->type);
2411 this->result = src_reg(dst);
2412
2413 emit_constant_values(&dst, ir);
2414 }
2415
2416 void
2417 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2418 {
2419 ir_dereference *deref = static_cast<ir_dereference *>(
2420 ir->actual_parameters.get_head());
2421 ir_variable *location = deref->variable_referenced();
2422 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2423 location->data.binding);
2424
2425 /* Calculate the surface offset */
2426 src_reg offset(this, glsl_type::uint_type);
2427 ir_dereference_array *deref_array = deref->as_dereference_array();
2428 if (deref_array) {
2429 deref_array->array_index->accept(this);
2430
2431 src_reg tmp(this, glsl_type::uint_type);
2432 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2433 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2434 } else {
2435 offset = location->data.atomic.offset;
2436 }
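/* Illustrative note (not part of the original source, and assuming
 * ATOMIC_COUNTER_SIZE is the 4-byte size of one counter): indexing
 * counters[2] in an array whose first element lives at byte offset 8 yields
 * offset = 2 * 4 + 8 = 16, the byte offset handed to the surface message.
 */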
2437
2438 /* Emit the appropriate machine instruction */
2439 const char *callee = ir->callee->function_name();
2440 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2441
2442 if (!strcmp("__intrinsic_atomic_read", callee)) {
2443 emit_untyped_surface_read(surf_index, dst, offset);
2444
2445 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2446 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2447 src_reg(), src_reg());
2448
2449 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2450 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2451 src_reg(), src_reg());
2452 }
2453
2454 brw_mark_surface_used(stage_prog_data, surf_index);
2455 }
2456
2457 void
2458 vec4_visitor::visit(ir_call *ir)
2459 {
2460 const char *callee = ir->callee->function_name();
2461
2462 if (!strcmp("__intrinsic_atomic_read", callee) ||
2463 !strcmp("__intrinsic_atomic_increment", callee) ||
2464 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2465 visit_atomic_counter_intrinsic(ir);
2466 } else {
2467 unreachable("Unsupported intrinsic.");
2468 }
2469 }
2470
2471 src_reg
2472 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2473 src_reg coordinate, src_reg sampler)
2474 {
2475 vec4_instruction *inst =
2476 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2477 dst_reg(this, glsl_type::uvec4_type));
2478 inst->base_mrf = 2;
2479 inst->src[1] = sampler;
2480
2481 int param_base;
2482
2483 if (devinfo->gen >= 9) {
2484 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2485 vec4_instruction *header_inst = new(mem_ctx)
2486 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2487 dst_reg(MRF, inst->base_mrf));
2488
2489 emit(header_inst);
2490
2491 inst->mlen = 2;
2492 inst->header_size = 1;
2493 param_base = inst->base_mrf + 1;
2494 } else {
2495 inst->mlen = 1;
2496 param_base = inst->base_mrf;
2497 }
2498
2499 /* parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
2500 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2501 int zero_mask = 0xf & ~coord_mask;
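/* Illustrative note (not part of the original source): for an ivec2
 * coordinate, coord_mask is 0x3 and zero_mask is 0xc, so the MOVs below write
 * u and v and clear the unused r and lod slots of the payload vec4.
 */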
2502
2503 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2504 coordinate));
2505
2506 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2507 src_reg(0)));
2508
2509 emit(inst);
2510 return src_reg(inst->dst);
2511 }
2512
2513 bool
2514 vec4_visitor::is_high_sampler(src_reg sampler)
2515 {
2516 if (devinfo->gen < 8 && !devinfo->is_haswell)
2517 return false;
2518
2519 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2520 }
2521
2522 void
2523 vec4_visitor::visit(ir_texture *ir)
2524 {
2525 uint32_t sampler =
2526 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2527
2528 ir_rvalue *nonconst_sampler_index =
2529 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2530
2531 /* Handle non-constant sampler array indexing */
2532 src_reg sampler_reg;
2533 if (nonconst_sampler_index) {
2534 /* The highest sampler which may be used by this operation is
2535 * the last element of the array. Mark it here, because the generator
2536 * doesn't have enough information to determine the bound.
2537 */
2538 uint32_t array_size = ir->sampler->as_dereference_array()
2539 ->array->type->array_size();
2540
2541 uint32_t max_used = sampler + array_size - 1;
2542 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2543 max_used += prog_data->base.binding_table.gather_texture_start;
2544 } else {
2545 max_used += prog_data->base.binding_table.texture_start;
2546 }
2547
2548 brw_mark_surface_used(&prog_data->base, max_used);
2549
2550 /* Emit code to evaluate the actual indexing expression */
2551 nonconst_sampler_index->accept(this);
2552 src_reg temp(this, glsl_type::uint_type);
2553 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2554 sampler_reg = emit_uniformize(temp);
2555 } else {
2556 /* Single sampler, or constant array index; the indexing expression
2557 * is just an immediate.
2558 */
2559 sampler_reg = src_reg(sampler);
2560 }
2561
2562 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2563 * emitting anything other than setting up the constant result.
2564 */
2565 if (ir->op == ir_tg4) {
2566 ir_constant *chan = ir->lod_info.component->as_constant();
2567 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2568 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2569 dst_reg result(this, ir->type);
2570 this->result = src_reg(result);
2571 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2572 return;
2573 }
2574 }
2575
2576 /* Should be lowered by do_lower_texture_projection */
2577 assert(!ir->projector);
2578
2579 /* Should be lowered */
2580 assert(!ir->offset || !ir->offset->type->is_array());
2581
2582 /* Generate code to compute all the subexpression trees. This has to be
2583 * done before loading any values into MRFs for the sampler message since
2584 * generating these values may involve SEND messages that need the MRFs.
2585 */
2586 src_reg coordinate;
2587 if (ir->coordinate) {
2588 ir->coordinate->accept(this);
2589 coordinate = this->result;
2590 }
2591
2592 src_reg shadow_comparitor;
2593 if (ir->shadow_comparitor) {
2594 ir->shadow_comparitor->accept(this);
2595 shadow_comparitor = this->result;
2596 }
2597
2598 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2599 src_reg offset_value;
2600 if (has_nonconstant_offset) {
2601 ir->offset->accept(this);
2602 offset_value = src_reg(this->result);
2603 }
2604
2605 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2606 src_reg lod, dPdx, dPdy, sample_index, mcs;
2607 switch (ir->op) {
2608 case ir_tex:
2609 lod = src_reg(0.0f);
2610 lod_type = glsl_type::float_type;
2611 break;
2612 case ir_txf:
2613 case ir_txl:
2614 case ir_txs:
2615 ir->lod_info.lod->accept(this);
2616 lod = this->result;
2617 lod_type = ir->lod_info.lod->type;
2618 break;
2619 case ir_query_levels:
2620 lod = src_reg(0);
2621 lod_type = glsl_type::int_type;
2622 break;
2623 case ir_txf_ms:
2624 ir->lod_info.sample_index->accept(this);
2625 sample_index = this->result;
2626 sample_index_type = ir->lod_info.sample_index->type;
2627
2628 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2629 mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2630 else
2631 mcs = src_reg(0u);
2632 break;
2633 case ir_txd:
2634 ir->lod_info.grad.dPdx->accept(this);
2635 dPdx = this->result;
2636
2637 ir->lod_info.grad.dPdy->accept(this);
2638 dPdy = this->result;
2639
2640 lod_type = ir->lod_info.grad.dPdx->type;
2641 break;
2642 case ir_txb:
2643 case ir_lod:
2644 case ir_tg4:
2645 break;
2646 }
2647
2648 enum opcode opcode;
2649 switch (ir->op) {
2650 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2651 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2652 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2653 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2654 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2655 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2656 case ir_tg4: opcode = has_nonconstant_offset
2657 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2658 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2659 case ir_txb:
2660 unreachable("TXB is not valid for vertex shaders.");
2661 case ir_lod:
2662 unreachable("LOD is not valid for vertex shaders.");
2663 default:
2664 unreachable("Unrecognized tex op");
2665 }
2666
2667 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2668 opcode, dst_reg(this, ir->type));
2669
2670 if (ir->offset != NULL && !has_nonconstant_offset) {
2671 inst->offset =
2672 brw_texture_offset(ir->offset->as_constant()->value.i,
2673 ir->offset->type->vector_elements);
2674 }
2675
2676 /* Stuff the channel select bits in the top of the texture offset */
2677 if (ir->op == ir_tg4)
2678 inst->offset |=
2679 gather_channel( ir->lod_info.component->as_constant()->value.i[0],
2680 sampler) << 16;
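/* Illustrative note (not part of the original source): gathering the green
 * component with an identity swizzle makes gather_channel() return 1, so the
 * OR above sets bit 16; the two channel-select bits sit above the packed
 * texel offsets in the same dword of inst->offset.
 */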
2681
2682 /* The message header is necessary for:
2683 * - Gen4 (always)
2684 * - Gen9+ for selecting SIMD4x2
2685 * - Texel offsets
2686 * - Gather channel selection
2687 * - Sampler indices too large to fit in a 4-bit value.
2688 */
2689 inst->header_size =
2690 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2691 inst->offset != 0 || ir->op == ir_tg4 ||
2692 is_high_sampler(sampler_reg)) ? 1 : 0;
2693 inst->base_mrf = 2;
2694 inst->mlen = inst->header_size + 1; /* always at least one */
2695 inst->dst.writemask = WRITEMASK_XYZW;
2696 inst->shadow_compare = ir->shadow_comparitor != NULL;
2697
2698 inst->src[1] = sampler_reg;
2699
2700 /* MRF for the first parameter */
2701 int param_base = inst->base_mrf + inst->header_size;
2702
2703 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2704 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2705 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2706 } else {
2707 /* Load the coordinate */
2708 /* FINISHME: gl_clamp_mask and saturate */
2709 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2710 int zero_mask = 0xf & ~coord_mask;
2711
2712 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2713 coordinate));
2714
2715 if (zero_mask != 0) {
2716 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2717 src_reg(0)));
2718 }
2719 /* Load the shadow comparitor */
2720 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2721 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2722 WRITEMASK_X),
2723 shadow_comparitor));
2724 inst->mlen++;
2725 }
2726
2727 /* Load the LOD info */
2728 if (ir->op == ir_tex || ir->op == ir_txl) {
2729 int mrf, writemask;
2730 if (devinfo->gen >= 5) {
2731 mrf = param_base + 1;
2732 if (ir->shadow_comparitor) {
2733 writemask = WRITEMASK_Y;
2734 /* mlen already incremented */
2735 } else {
2736 writemask = WRITEMASK_X;
2737 inst->mlen++;
2738 }
2739 } else /* devinfo->gen == 4 */ {
2740 mrf = param_base;
2741 writemask = WRITEMASK_W;
2742 }
2743 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2744 } else if (ir->op == ir_txf) {
2745 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2746 } else if (ir->op == ir_txf_ms) {
2747 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2748 sample_index));
2749 if (devinfo->gen >= 7) {
2750 /* MCS data is in the first channel of `mcs`, but we need to get it into
2751 * the .y channel of the second vec4 of params, so replicate .x across
2752 * the whole vec4 and then mask off everything except .y
2753 */
2754 mcs.swizzle = BRW_SWIZZLE_XXXX;
2755 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2756 mcs));
2757 }
2758 inst->mlen++;
2759 } else if (ir->op == ir_txd) {
2760 const glsl_type *type = lod_type;
2761
2762 if (devinfo->gen >= 5) {
2763 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2764 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2765 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2766 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2767 inst->mlen++;
2768
2769 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2770 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2771 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2772 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2773 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2774 inst->mlen++;
2775
2776 if (ir->shadow_comparitor) {
2777 emit(MOV(dst_reg(MRF, param_base + 2,
2778 ir->shadow_comparitor->type, WRITEMASK_Z),
2779 shadow_comparitor));
2780 }
2781 }
2782 } else /* devinfo->gen == 4 */ {
2783 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2784 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2785 inst->mlen += 2;
2786 }
2787 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2788 if (ir->shadow_comparitor) {
2789 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2790 shadow_comparitor));
2791 }
2792
2793 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2794 offset_value));
2795 inst->mlen++;
2796 }
2797 }
2798
2799 emit(inst);
2800
2801 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2802 * spec requires layers.
2803 */
2804 if (ir->op == ir_txs) {
2805 glsl_type const *type = ir->sampler->type;
2806 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2807 type->sampler_array) {
2808 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2809 writemask(inst->dst, WRITEMASK_Z),
2810 src_reg(inst->dst), src_reg(6));
2811 }
2812 }
2813
2814 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2815 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2816 }
2817
2818 swizzle_result(ir, src_reg(inst->dst), sampler);
2819 }
2820
2821 /**
2822 * Apply workarounds for Gen6 gather with UINT/SINT
2823 */
2824 void
2825 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2826 {
2827 if (!wa)
2828 return;
2829
2830 int width = (wa & WA_8BIT) ? 8 : 16;
2831 dst_reg dst_f = dst;
2832 dst_f.type = BRW_REGISTER_TYPE_F;
2833
2834 /* Convert from UNORM to UINT */
2835 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2836 emit(MOV(dst, src_reg(dst_f)));
2837
2838 if (wa & WA_SIGN) {
2839 /* Reinterpret the UINT value as a signed INT value by
2840 * shifting the sign bit into place, then shifting back
2841 * preserving sign.
2842 */
2843 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2844 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2845 }
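/* Illustrative note (not part of the original source): with WA_8BIT the MUL
 * above scales the UNORM result by 255, recovering the raw 8-bit value, and
 * the MOV converts it to an integer; with WA_SIGN the SHL/ASR pair by
 * 32 - 8 = 24 bits then sign-extends that value to a full 32-bit signed int.
 */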
2846 }
2847
2848 /**
2849 * Set up the gather channel based on the swizzle, for gather4.
2850 */
2851 uint32_t
2852 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2853 {
2854 int swiz = GET_SWZ(key->tex.swizzles[sampler], gather_component);
2855 switch (swiz) {
2856 case SWIZZLE_X: return 0;
2857 case SWIZZLE_Y:
2858 /* gather4 sampler is broken for green channel on RG32F --
2859 * we must ask for blue instead.
2860 */
2861 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2862 return 2;
2863 return 1;
2864 case SWIZZLE_Z: return 2;
2865 case SWIZZLE_W: return 3;
2866 default:
2867 unreachable("Not reached"); /* zero, one swizzles handled already */
2868 }
2869 }
2870
2871 void
2872 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2873 {
2874 int s = key->tex.swizzles[sampler];
2875
2876 this->result = src_reg(this, ir->type);
2877 dst_reg swizzled_result(this->result);
2878
2879 if (ir->op == ir_query_levels) {
2880 /* # levels is in .w */
2881 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2882 emit(MOV(swizzled_result, orig_val));
2883 return;
2884 }
2885
2886 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2887 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2888 emit(MOV(swizzled_result, orig_val));
2889 return;
2890 }
2891
2892
2893 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2894 int swizzle[4] = {0};
2895
2896 for (int i = 0; i < 4; i++) {
2897 switch (GET_SWZ(s, i)) {
2898 case SWIZZLE_ZERO:
2899 zero_mask |= (1 << i);
2900 break;
2901 case SWIZZLE_ONE:
2902 one_mask |= (1 << i);
2903 break;
2904 default:
2905 copy_mask |= (1 << i);
2906 swizzle[i] = GET_SWZ(s, i);
2907 break;
2908 }
2909 }
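/* Illustrative note (not part of the original source): for a texture swizzle
 * of (R, G, ZERO, ONE), copy_mask ends up 0x3 with an identity swizzle,
 * zero_mask is 0x4 and one_mask is 0x8, so the code below emits three MOVs:
 * one copying .xy, one writing 0.0 to .z, and one writing 1.0 to .w.
 */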
2910
2911 if (copy_mask) {
2912 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2913 swizzled_result.writemask = copy_mask;
2914 emit(MOV(swizzled_result, orig_val));
2915 }
2916
2917 if (zero_mask) {
2918 swizzled_result.writemask = zero_mask;
2919 emit(MOV(swizzled_result, src_reg(0.0f)));
2920 }
2921
2922 if (one_mask) {
2923 swizzled_result.writemask = one_mask;
2924 emit(MOV(swizzled_result, src_reg(1.0f)));
2925 }
2926 }
2927
2928 void
2929 vec4_visitor::visit(ir_return *)
2930 {
2931 unreachable("not reached");
2932 }
2933
2934 void
2935 vec4_visitor::visit(ir_discard *)
2936 {
2937 unreachable("not reached");
2938 }
2939
2940 void
2941 vec4_visitor::visit(ir_if *ir)
2942 {
2943 /* Don't point the annotation at the if statement, because then it plus
2944 * the then and else blocks get printed.
2945 */
2946 this->base_ir = ir->condition;
2947
2948 if (devinfo->gen == 6) {
2949 emit_if_gen6(ir);
2950 } else {
2951 enum brw_predicate predicate;
2952 emit_bool_to_cond_code(ir->condition, &predicate);
2953 emit(IF(predicate));
2954 }
2955
2956 visit_instructions(&ir->then_instructions);
2957
2958 if (!ir->else_instructions.is_empty()) {
2959 this->base_ir = ir->condition;
2960 emit(BRW_OPCODE_ELSE);
2961
2962 visit_instructions(&ir->else_instructions);
2963 }
2964
2965 this->base_ir = ir->condition;
2966 emit(BRW_OPCODE_ENDIF);
2967 }
2968
2969 void
2970 vec4_visitor::visit(ir_emit_vertex *)
2971 {
2972 unreachable("not reached");
2973 }
2974
2975 void
2976 vec4_visitor::visit(ir_end_primitive *)
2977 {
2978 unreachable("not reached");
2979 }
2980
2981 void
2982 vec4_visitor::visit(ir_barrier *)
2983 {
2984 unreachable("not reached");
2985 }
2986
2987 void
2988 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2989 dst_reg dst, src_reg offset,
2990 src_reg src0, src_reg src1)
2991 {
2992 unsigned mlen = 0;
2993
2994 /* Set the atomic operation offset. */
2995 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2996 mlen++;
2997
2998 /* Set the atomic operation arguments. */
2999 if (src0.file != BAD_FILE) {
3000 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
3001 mlen++;
3002 }
3003
3004 if (src1.file != BAD_FILE) {
3005 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3006 mlen++;
3007 }
3008
3009 /* Emit the instruction. Note that this maps to the normal SIMD8
3010 * untyped atomic message on Ivy Bridge, but that's OK because
3011 * unused channels will be masked out.
3012 */
3013 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3014 brw_message_reg(0),
3015 src_reg(surf_index), src_reg(atomic_op));
3016 inst->mlen = mlen;
3017 }
3018
3019 void
3020 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3021 src_reg offset)
3022 {
3023 /* Set the surface read offset. */
3024 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3025
3026 /* Emit the instruction. Note that this maps to the normal SIMD8
3027 * untyped surface read message, but that's OK because unused
3028 * channels will be masked out.
3029 */
3030 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3031 brw_message_reg(0),
3032 src_reg(surf_index), src_reg(1));
3033 inst->mlen = 1;
3034 }
3035
3036 void
3037 vec4_visitor::emit_ndc_computation()
3038 {
3039 /* Get the position */
3040 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3041
3042 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3043 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3044 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3045
3046 current_annotation = "NDC";
3047 dst_reg ndc_w = ndc;
3048 ndc_w.writemask = WRITEMASK_W;
3049 src_reg pos_w = pos;
3050 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3051 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3052
3053 dst_reg ndc_xyz = ndc;
3054 ndc_xyz.writemask = WRITEMASK_XYZ;
3055
3056 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3057 }
3058
3059 void
3060 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3061 {
3062 if (devinfo->gen < 6 &&
3063 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3064 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3065 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3066 dst_reg header1_w = header1;
3067 header1_w.writemask = WRITEMASK_W;
3068
3069 emit(MOV(header1, 0u));
3070
3071 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3072 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3073
3074 current_annotation = "Point size";
3075 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3076 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3077 }
3078
3079 if (key->userclip_active) {
3080 current_annotation = "Clipping flags";
3081 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3082 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3083
3084 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3085 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3086 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3087
3088 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3089 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3090 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3091 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3092 }
3093
3094 /* i965 clipping workaround:
3095 * 1) Test for -ve rhw
3096 * 2) If set,
3097 * set ndc = (0,0,0,0)
3098 * set ucp[6] = 1
3099 *
3100 * Later, clipping will detect ucp[6] and ensure the primitive is
3101 * clipped against all fixed planes.
3102 */
3103 if (devinfo->has_negative_rhw_bug) {
3104 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3105 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3106 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3107 vec4_instruction *inst;
3108 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3109 inst->predicate = BRW_PREDICATE_NORMAL;
3110 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3111 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3112 inst->predicate = BRW_PREDICATE_NORMAL;
3113 }
3114
3115 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3116 } else if (devinfo->gen < 6) {
3117 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3118 } else {
3119 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3120 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3121 dst_reg reg_w = reg;
3122 reg_w.writemask = WRITEMASK_W;
3123 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3124 reg_as_src.type = reg_w.type;
3125 reg_as_src.swizzle = brw_swizzle_for_size(1);
3126 emit(MOV(reg_w, reg_as_src));
3127 }
3128 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3129 dst_reg reg_y = reg;
3130 reg_y.writemask = WRITEMASK_Y;
3131 reg_y.type = BRW_REGISTER_TYPE_D;
3132 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3133 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3134 }
3135 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3136 dst_reg reg_z = reg;
3137 reg_z.writemask = WRITEMASK_Z;
3138 reg_z.type = BRW_REGISTER_TYPE_D;
3139 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3140 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3141 }
3142 }
3143 }
3144
3145 void
3146 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3147 {
3148 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3149 *
3150 * "If a linked set of shaders forming the vertex stage contains no
3151 * static write to gl_ClipVertex or gl_ClipDistance, but the
3152 * application has requested clipping against user clip planes through
3153 * the API, then the coordinate written to gl_Position is used for
3154 * comparison against the user clip planes."
3155 *
3156 * This function is only called if the shader didn't write to
3157 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3158 * if the user wrote to it; otherwise we use gl_Position.
3159 */
3160 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3161 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3162 clip_vertex = VARYING_SLOT_POS;
3163 }
3164
3165 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3166 ++i) {
3167 reg.writemask = 1 << i;
3168 emit(DP4(reg,
3169 src_reg(output_reg[clip_vertex]),
3170 src_reg(this->userplane[i + offset])));
3171 }
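/* Illustrative note (not part of the original source): with six user clip
 * planes enabled, the offset == 0 call writes planes 0-3 into .xyzw of
 * CLIP_DIST0 and the offset == 4 call writes planes 4 and 5 into .xy of
 * CLIP_DIST1; the i + offset < nr_userclip_plane_consts bound stops it there.
 */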
3172 }
3173
3174 vec4_instruction *
3175 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3176 {
3177 assert(varying < VARYING_SLOT_MAX);
3178 assert(output_reg[varying].type == reg.type);
3179 current_annotation = output_reg_annotation[varying];
3180 /* Copy the register, saturating if necessary */
3181 return emit(MOV(reg, src_reg(output_reg[varying])));
3182 }
3183
3184 void
3185 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3186 {
3187 reg.type = BRW_REGISTER_TYPE_F;
3188 output_reg[varying].type = reg.type;
3189
3190 switch (varying) {
3191 case VARYING_SLOT_PSIZ:
3192 {
3193 /* PSIZ is always in slot 0, and is coupled with other flags. */
3194 current_annotation = "indices, point width, clip flags";
3195 emit_psiz_and_flags(reg);
3196 break;
3197 }
3198 case BRW_VARYING_SLOT_NDC:
3199 current_annotation = "NDC";
3200 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3201 break;
3202 case VARYING_SLOT_POS:
3203 current_annotation = "gl_Position";
3204 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3205 break;
3206 case VARYING_SLOT_EDGE:
3207 /* This is present when doing unfilled polygons. We're supposed to copy
3208 * the edge flag from the user-provided vertex array
3209 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3210 * of that attribute (starts as 1.0f). This is then used in clipping to
3211 * determine which edges should be drawn as wireframe.
3212 */
3213 current_annotation = "edge flag";
3214 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3215 glsl_type::float_type, WRITEMASK_XYZW))));
3216 break;
3217 case BRW_VARYING_SLOT_PAD:
3218 /* No need to write to this slot */
3219 break;
3220 case VARYING_SLOT_COL0:
3221 case VARYING_SLOT_COL1:
3222 case VARYING_SLOT_BFC0:
3223 case VARYING_SLOT_BFC1: {
3224 /* These built-in varyings are only supported in compatibility mode,
3225 * and we only support GS in core profile. So, this must be a vertex
3226 * shader.
3227 */
3228 assert(stage == MESA_SHADER_VERTEX);
3229 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3230 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3231 inst->saturate = true;
3232 break;
3233 }
3234
3235 default:
3236 emit_generic_urb_slot(reg, varying);
3237 break;
3238 }
3239 }
3240
3241 static int
3242 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3243 {
3244 if (devinfo->gen >= 6) {
3245 /* URB data written (does not include the message header reg) must
3246 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3247 * section 5.4.3.2.2: URB_INTERLEAVED.
3248 *
3249 * URB entries are allocated on a multiple of 1024 bits, so an
3250 * extra 128 bits written here to make the end align to 256 is
3251 * no problem.
3252 */
3253 if ((mlen % 2) != 1)
3254 mlen++;
3255 }
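/* Illustrative note (not part of the original source): mlen includes the
 * header register, so header + 3 data registers (mlen == 4) gets bumped to 5
 * so the data portion is 4 registers, i.e. two full 256-bit URB rows, while
 * header + 2 data registers (mlen == 3) is already aligned and left alone.
 */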
3256
3257 return mlen;
3258 }
3259
3260
3261 /**
3262 * Generates the VUE payload plus the necessary URB write instructions to
3263 * output it.
3264 *
3265 * The VUE layout is documented in Volume 2a.
3266 */
3267 void
3268 vec4_visitor::emit_vertex()
3269 {
3270 /* MRF 0 is reserved for the debugger, so start with message header
3271 * in MRF 1.
3272 */
3273 int base_mrf = 1;
3274 int mrf = base_mrf;
3275 /* In the process of generating our URB write message contents, we
3276 * may need to unspill a register or load from an array. Those
3277 * reads would use MRFs 14-15.
3278 */
3279 int max_usable_mrf = 13;
3280
3281 /* The following assertion verifies that max_usable_mrf causes an
3282 * even-numbered amount of URB write data, which will meet gen6's
3283 * requirements for length alignment.
3284 */
3285 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3286
3287 /* First mrf is the g0-based message header containing URB handles and
3288 * such.
3289 */
3290 emit_urb_write_header(mrf++);
3291
3292 if (devinfo->gen < 6) {
3293 emit_ndc_computation();
3294 }
3295
3296 /* Lower legacy ff and ClipVertex clipping to clip distances */
3297 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3298 current_annotation = "user clip distances";
3299
3300 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3301 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3302
3303 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3304 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3305 }
3306
3307 /* We may need to split this up into several URB writes, so do them in a
3308 * loop.
3309 */
3310 int slot = 0;
3311 bool complete = false;
3312 do {
3313 /* URB offset is in URB row increments, and each of our MRFs is half of
3314 * one of those, since we're doing interleaved writes.
3315 */
3316 int offset = slot / 2;
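/* Illustrative note (not part of the original source): with two interleaved
 * slots per 256-bit URB row, resuming at slot 6 for a second URB write gives
 * offset == 3 rows past the start of the entry.
 */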
3317
3318 mrf = base_mrf + 1;
3319 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3320 emit_urb_slot(dst_reg(MRF, mrf++),
3321 prog_data->vue_map.slot_to_varying[slot]);
3322
3323 /* If this was max_usable_mrf, we can't fit anything more into this
3324 * URB WRITE.
3325 */
3326 if (mrf > max_usable_mrf) {
3327 slot++;
3328 break;
3329 }
3330 }
3331
3332 complete = slot >= prog_data->vue_map.num_slots;
3333 current_annotation = "URB write";
3334 vec4_instruction *inst = emit_urb_write_opcode(complete);
3335 inst->base_mrf = base_mrf;
3336 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3337 inst->offset += offset;
3338 } while (!complete);
3339 }
3340
3341
3342 src_reg
3343 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3344 src_reg *reladdr, int reg_offset)
3345 {
3346 /* Because we store the values to scratch interleaved like our
3347 * vertex data, we need to scale the vec4 index by 2.
3348 */
3349 int message_header_scale = 2;
3350
3351 /* Pre-gen6, the message header uses byte offsets instead of vec4
3352 * (16-byte) offset units.
3353 */
3354 if (devinfo->gen < 6)
3355 message_header_scale *= 16;
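/* Illustrative note (not part of the original source): with the scale set up
 * above, a constant reg_offset of 3 becomes an immediate of 3 * 2 = 6
 * interleaved vec4 rows on Gen6+, or 3 * 2 * 16 = 96 bytes on Gen4-5, in the
 * non-reladdr case handled below.
 */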
3356
3357 if (reladdr) {
3358 src_reg index = src_reg(this, glsl_type::int_type);
3359
3360 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3361 src_reg(reg_offset)));
3362 emit_before(block, inst, MUL(dst_reg(index), index,
3363 src_reg(message_header_scale)));
3364
3365 return index;
3366 } else {
3367 return src_reg(reg_offset * message_header_scale);
3368 }
3369 }
3370
3371 src_reg
3372 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3373 src_reg *reladdr, int reg_offset)
3374 {
3375 if (reladdr) {
3376 src_reg index = src_reg(this, glsl_type::int_type);
3377
3378 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3379 src_reg(reg_offset)));
3380
3381 /* Pre-gen6, the message header uses byte offsets instead of vec4
3382 * (16-byte) offset units.
3383 */
3384 if (devinfo->gen < 6) {
3385 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3386 }
3387
3388 return index;
3389 } else if (devinfo->gen >= 8) {
3390 /* Store the offset in a GRF so we can send-from-GRF. */
3391 src_reg offset = src_reg(this, glsl_type::int_type);
3392 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3393 return offset;
3394 } else {
3395 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3396 return src_reg(reg_offset * message_header_scale);
3397 }
3398 }
3399
3400 /**
3401 * Emits an instruction before @inst to load the value named by @orig_src
3402 * from scratch space at @base_offset to @temp.
3403 *
3404 * @base_offset is measured in 32-byte units (the size of a register).
3405 */
3406 void
3407 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3408 dst_reg temp, src_reg orig_src,
3409 int base_offset)
3410 {
3411 int reg_offset = base_offset + orig_src.reg_offset;
3412 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3413 reg_offset);
3414
3415 emit_before(block, inst, SCRATCH_READ(temp, index));
3416 }
3417
3418 /**
3419 * Emits an instruction after @inst to store the value to be written
3420 * to @orig_dst to scratch space at @base_offset, from @temp.
3421 *
3422 * @base_offset is measured in 32-byte units (the size of a register).
3423 */
3424 void
3425 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3426 int base_offset)
3427 {
3428 int reg_offset = base_offset + inst->dst.reg_offset;
3429 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3430 reg_offset);
3431
3432 /* Create a temporary register to store *inst's result in.
3433 *
3434 * We have to be careful in MOVing from our temporary result register in
3435 * the scratch write. If we swizzle from channels of the temporary that
3436 * weren't initialized, it will confuse live interval analysis, which will
3437 * make spilling fail to make progress.
3438 */
3439 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3440 inst->dst.type),
3441 brw_swizzle_for_mask(inst->dst.writemask));
3442 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3443 inst->dst.writemask));
3444 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3445 write->predicate = inst->predicate;
3446 write->ir = inst->ir;
3447 write->annotation = inst->annotation;
3448 inst->insert_after(block, write);
3449
3450 inst->dst.file = temp.file;
3451 inst->dst.reg = temp.reg;
3452 inst->dst.reg_offset = temp.reg_offset;
3453 inst->dst.reladdr = NULL;
3454 }
3455
3456 /**
3457 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3458 * adds the scratch read(s) before \p inst. The function also checks for
3459 * recursive reladdr scratch accesses, issuing the corresponding scratch
3460 * loads and rewriting reladdr references accordingly.
3461 *
3462 * \return \p src if it did not require a scratch load, otherwise, the
3463 * register holding the result of the scratch load that the caller should
3464 * use to rewrite src.
3465 */
3466 src_reg
3467 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3468 vec4_instruction *inst, src_reg src)
3469 {
3470 /* Resolve recursive reladdr scratch access by calling ourselves
3471 * with src.reladdr
3472 */
3473 if (src.reladdr)
3474 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3475 *src.reladdr);
3476
3477 /* Now handle scratch access on src */
3478 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3479 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3480 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3481 src.reg = temp.reg;
3482 src.reg_offset = temp.reg_offset;
3483 src.reladdr = NULL;
3484 }
3485
3486 return src;
3487 }
3488
3489 /**
3490 * We can't generally support array access in GRF space, because a
3491 * single instruction's destination can only span 2 contiguous
3492 * registers. So, we send all GRF arrays that get variable index
3493 * access to scratch space.
3494 */
3495 void
3496 vec4_visitor::move_grf_array_access_to_scratch()
3497 {
3498 int scratch_loc[this->alloc.count];
3499 memset(scratch_loc, -1, sizeof(scratch_loc));
3500
3501 /* First, calculate the set of virtual GRFs that need to be punted
3502 * to scratch due to having any array access on them, and where in
3503 * scratch.
3504 */
3505 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3506 if (inst->dst.file == GRF && inst->dst.reladdr) {
3507 if (scratch_loc[inst->dst.reg] == -1) {
3508 scratch_loc[inst->dst.reg] = last_scratch;
3509 last_scratch += this->alloc.sizes[inst->dst.reg];
3510 }
3511
3512 for (src_reg *iter = inst->dst.reladdr;
3513 iter->reladdr;
3514 iter = iter->reladdr) {
3515 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3516 scratch_loc[iter->reg] = last_scratch;
3517 last_scratch += this->alloc.sizes[iter->reg];
3518 }
3519 }
3520 }
3521
3522 for (int i = 0 ; i < 3; i++) {
3523 for (src_reg *iter = &inst->src[i];
3524 iter->reladdr;
3525 iter = iter->reladdr) {
3526 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3527 scratch_loc[iter->reg] = last_scratch;
3528 last_scratch += this->alloc.sizes[iter->reg];
3529 }
3530 }
3531 }
3532 }
3533
3534 /* Now, for anything that will be accessed through scratch, rewrite
3535 * it to load/store. Note that this is a _safe list walk, because
3536 * we may generate a new scratch_write instruction after the one
3537 * we're processing.
3538 */
3539 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3540 /* Set up the annotation tracking for new generated instructions. */
3541 base_ir = inst->ir;
3542 current_annotation = inst->annotation;
3543
3544 /* First handle scratch access on the dst. Notice we have to handle
3545 * the case where the dst's reladdr also points to scratch space.
3546 */
3547 if (inst->dst.reladdr)
3548 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3549 *inst->dst.reladdr);
3550
3551 /* Now that we have handled any (possibly recursive) reladdr scratch
3552 * accesses for dst we can safely do the scratch write for dst itself
3553 */
3554 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3555 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3556
3557 /* Now handle scratch access on any src. In this case, since inst->src[i]
3558 * already is a src_reg, we can just call emit_resolve_reladdr with
3559 * inst->src[i] and it will take care of handling scratch loads for
3560 * both src and src.reladdr (recursively).
3561 */
3562 for (int i = 0 ; i < 3; i++) {
3563 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3564 inst->src[i]);
3565 }
3566 }
3567 }
3568
3569 /**
3570 * Emits an instruction before @inst to load the value named by @orig_src
3571 * from the pull constant buffer (surface) at @base_offset to @temp.
3572 */
3573 void
3574 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3575 dst_reg temp, src_reg orig_src,
3576 int base_offset)
3577 {
3578 int reg_offset = base_offset + orig_src.reg_offset;
3579 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3580 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3581 reg_offset);
3582
3583 emit_pull_constant_load_reg(temp,
3584 index,
3585 offset,
3586 block, inst);
3587 }
3588
3589 /**
3590 * Implements array access of uniforms by inserting a
3591 * PULL_CONSTANT_LOAD instruction.
3592 *
3593 * Unlike temporary GRF array access (where we don't support it due to
3594 * the difficulty of doing relative addressing on instruction
3595 * destinations), we could potentially do array access of uniforms
3596 * that were loaded in GRF space as push constants. In real-world
3597 * usage we've seen, though, the arrays being used are always larger
3598 * than we could load as push constants, so just always move all
3599 * uniform array access out to a pull constant buffer.
3600 */
3601 void
3602 vec4_visitor::move_uniform_array_access_to_pull_constants()
3603 {
3604 int pull_constant_loc[this->uniforms];
3605 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3606 bool nested_reladdr;
3607
3608 /* Walk through and find array access of uniforms. Put a copy of that
3609 * uniform in the pull constant buffer.
3610 *
3611 * Note that we don't move constant-indexed accesses to arrays. No
3612 * testing has been done of the performance impact of this choice.
3613 */
3614 do {
3615 nested_reladdr = false;
3616
3617 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3618 for (int i = 0 ; i < 3; i++) {
3619 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3620 continue;
3621
3622 int uniform = inst->src[i].reg;
3623
3624 if (inst->src[i].reladdr->reladdr)
3625 nested_reladdr = true; /* will need another pass */
3626
3627 /* If this array isn't already present in the pull constant buffer,
3628 * add it.
3629 */
3630 if (pull_constant_loc[uniform] == -1) {
3631 const gl_constant_value **values =
3632 &stage_prog_data->param[uniform * 4];
3633
3634 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3635
3636 assert(uniform < uniform_array_size);
3637 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3638 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3639 = values[j];
3640 }
3641 }
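/* Illustrative note (not part of the original source): uniform_size[] counts
 * vec4 slots, so a uniform array occupying 8 vec4s contributes 32 scalar
 * entries to pull_param[], and pull_constant_loc[] records the vec4 slot in
 * the pull buffer where that array now starts.
 */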
3642
3643 /* Set up the annotation tracking for new generated instructions. */
3644 base_ir = inst->ir;
3645 current_annotation = inst->annotation;
3646
3647 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3648
3649 emit_pull_constant_load(block, inst, temp, inst->src[i],
3650 pull_constant_loc[uniform]);
3651
3652 inst->src[i].file = temp.file;
3653 inst->src[i].reg = temp.reg;
3654 inst->src[i].reg_offset = temp.reg_offset;
3655 inst->src[i].reladdr = NULL;
3656 }
3657 }
3658 } while (nested_reladdr);
3659
3660 /* Now there are no accesses of the UNIFORM file with a reladdr, so there
3661 * is no need to track them as larger-than-vec4 objects. This is relied
3662 * on when cutting unused uniform vectors out of the push
3663 * constants.
3664 */
3665 split_uniform_registers();
3666 }
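
/* The relocation bookkeeping performed by the loop above can be summarized
 * with a standalone sketch; names are hypothetical and plain ints stand in
 * for the gl_constant_value pointers:
 *
 *    static int
 *    relocate_uniform_array(const int *param, int uniform, int size_in_vec4s,
 *                           int *pull_param, int *nr_pull_params)
 *    {
 *       // Record where the array starts in the pull buffer, in vec4 slots.
 *       int loc = *nr_pull_params / 4;
 *
 *       // Copy size_in_vec4s * 4 scalar components, starting at the
 *       // uniform's first component, just like the pull_param[] loop above.
 *       for (int j = 0; j < size_in_vec4s * 4; j++)
 *          pull_param[(*nr_pull_params)++] = param[uniform * 4 + j];
 *
 *       return loc;   // becomes pull_constant_loc[uniform]
 *    }
 */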
3667
/**
 * If @reg is an unsigned (UD) value with the negate modifier set, materialize
 * the negation through a MOV into a fresh temporary so that the register the
 * caller keeps using carries no source modifier.
 */
3668 void
3669 vec4_visitor::resolve_ud_negate(src_reg *reg)
3670 {
3671 if (reg->type != BRW_REGISTER_TYPE_UD ||
3672 !reg->negate)
3673 return;
3674
3675 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3676 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3677 *reg = temp;
3678 }
3679
3680 /**
3681 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3682 *
3683 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3684 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3685 */
3686 void
3687 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3688 {
3689 assert(devinfo->gen <= 5);
3690
3691 if (!rvalue->type->is_boolean())
3692 return;
3693
3694 src_reg and_result = src_reg(this, rvalue->type);
3695 src_reg neg_result = src_reg(this, rvalue->type);
3696 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3697 emit(MOV(dst_reg(neg_result), negate(and_result)));
3698 *reg = neg_result;
3699 }
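
/* A standalone sketch of the fixup: only the LSB of a Gen4-5 CMP result is
 * defined, so masking with 1 and negating the masked value yields the
 * canonical 0 / ~0 boolean encoding. Plain C integers here, purely
 * illustrative of what the AND plus negated MOV above compute:
 *
 *    static int32_t
 *    canonical_bool(int32_t cmp_result)
 *    {
 *       int32_t lsb = cmp_result & 1;   // the only defined bit
 *       return -lsb;                    // 0 stays 0, 1 becomes ~0 (all ones)
 *    }
 */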
3700
3701 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3702 void *log_data,
3703 struct gl_program *prog,
3704 const struct brw_vue_prog_key *key,
3705 struct brw_vue_prog_data *prog_data,
3706 struct gl_shader_program *shader_prog,
3707 gl_shader_stage stage,
3708 void *mem_ctx,
3709 bool no_spills,
3710 int shader_time_index)
3711 : backend_shader(compiler, log_data, mem_ctx,
3712 shader_prog, prog, &prog_data->base, stage),
3713 key(key),
3714 prog_data(prog_data),
3715 sanity_param_count(0),
3716 fail_msg(NULL),
3717 first_non_payload_grf(0),
3718 need_all_constants_in_pull_buffer(false),
3719 no_spills(no_spills),
3720 shader_time_index(shader_time_index),
3721 last_scratch(0)
3722 {
3723 this->failed = false;
3724
3725 this->base_ir = NULL;
3726 this->current_annotation = NULL;
3727 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3728
3729 this->variable_ht = hash_table_ctor(0,
3730 hash_table_pointer_hash,
3731 hash_table_pointer_compare);
3732
3733 this->virtual_grf_start = NULL;
3734 this->virtual_grf_end = NULL;
3735 this->live_intervals = NULL;
3736
3737 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3738
3739 this->uniforms = 0;
3740
3741 /* Initialize uniform_array_size to at least 1, because the pre-gen6 VS
3742 * requires at least one uniform slot. See setup_uniforms() in brw_vec4.cpp.
3743 */
3744 this->uniform_array_size = 1;
3745 if (prog_data) {
3746 this->uniform_array_size =
3747 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3748 }
3749
3750 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3751 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3752 }
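
/* Worked example for the sizing above (illustrative only): with
 * stage_prog_data->nr_params == 9 scalar parameters, DIV_ROUND_UP(9, 4) == 3,
 * so three vec4-sized uniform slots are tracked; with nr_params == 0 the
 * MAX2(..., 1) clamp keeps uniform_size[] and uniform_vector_size[] at least
 * one entry long, matching the pre-gen6 VS requirement noted above.
 */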
3753
3754 vec4_visitor::~vec4_visitor()
3755 {
3756 hash_table_dtor(this->variable_ht);
3757 }
3758
3759
3760 void
3761 vec4_visitor::fail(const char *format, ...)
3762 {
3763 va_list va;
3764 char *msg;
3765
3766 if (failed)
3767 return;
3768
3769 failed = true;
3770
3771 va_start(va, format);
3772 msg = ralloc_vasprintf(mem_ctx, format, va);
3773 va_end(va);
3774 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3775
3776 this->fail_msg = msg;
3777
3778 if (debug_enabled) {
3779 fprintf(stderr, "%s", msg);
3780 }
3781 }
3782
3783 } /* namespace brw */