1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
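/* Emit the dot product of src0 and src1 into dst, using DP2, DP3 or DP4
 * according to the component count in elements (which must be 2, 3 or 4).
 */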
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416     * The upper word of each write-channel must be 0 for the following
417     * bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
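   /* Worked example (values chosen for illustration): packing vec2(1.0, 2.0)
    * yields the half-floats 0x3c00 and 0x4000, so tmp.xy becomes
    * (0x00003c00, 0x00004000); the SHL of tmp.yyyy below then gives
    * 0x40000000 and the final OR with tmp.xxxx gives 0x40003c00, which is
    * packHalf2x16(vec2(1.0, 2.0)).
    */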
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
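   /* Worked example (values chosen for illustration): for src0 = 0x40003c00,
    * the AND below leaves 0x3c00 in tmp.x, the SHR leaves 0x4000 in tmp.y,
    * and F16TO32 then produces dst.xy = (1.0, 2.0), matching
    * unpackHalf2x16().
    */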
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
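   /* The bytes 0x00, 0x60, 0x70 and 0x78 are the restricted 8-bit float (VF)
    * encodings of 0.0, 8.0, 16.0 and 24.0; the type-converting MOV below
    * turns them into the integer shift counts <0, 8, 16, 24>.
    */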
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
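   /* packUnorm4x8: clamp each component to [0, 1] with a saturating MOV,
    * scale by 255, round to nearest even, convert to unsigned integer, and
    * pack the low byte of each channel into a single dword.
    */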
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575 /**
576 * Returns the minimum number of vec4 elements needed to pack a type.
577 *
578 * For simple types, it will return 1 (a single vec4); for matrices, the
579 * number of columns; for array and struct, the sum of the vec4_size of
580 * each of its elements; and for sampler and atomic, zero.
581 *
582 * This method is useful to calculate how much register space is needed to
583 * store a particular type.
584 */
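/* For example: a mat3 takes 3 vec4s, a float[4] array takes 4 (one per
 * element), and struct { vec3 a; float b; } takes 2, since each member is
 * padded out to a full vec4.
 */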
585 int
586 vec4_visitor::type_size(const struct glsl_type *type)
587 {
588 unsigned int i;
589 int size;
590
591 switch (type->base_type) {
592 case GLSL_TYPE_UINT:
593 case GLSL_TYPE_INT:
594 case GLSL_TYPE_FLOAT:
595 case GLSL_TYPE_BOOL:
596 if (type->is_matrix()) {
597 return type->matrix_columns;
598 } else {
599          /* Regardless of the size of the vector, it gets a vec4. This is bad
600 * packing for things like floats, but otherwise arrays become a
601 * mess. Hopefully a later pass over the code can pack scalars
602 * down if appropriate.
603 */
604 return 1;
605 }
606 case GLSL_TYPE_ARRAY:
607 assert(type->length > 0);
608 return type_size(type->fields.array) * type->length;
609 case GLSL_TYPE_STRUCT:
610 size = 0;
611 for (i = 0; i < type->length; i++) {
612 size += type_size(type->fields.structure[i].type);
613 }
614 return size;
615 case GLSL_TYPE_SUBROUTINE:
616 return 1;
617
618 case GLSL_TYPE_SAMPLER:
619 /* Samplers take up no register space, since they're baked in at
620 * link time.
621 */
622 return 0;
623 case GLSL_TYPE_ATOMIC_UINT:
624 return 0;
625 case GLSL_TYPE_IMAGE:
626 case GLSL_TYPE_VOID:
627 case GLSL_TYPE_DOUBLE:
628 case GLSL_TYPE_ERROR:
629 case GLSL_TYPE_INTERFACE:
630 unreachable("not reached");
631 }
632
633 return 0;
634 }
635
636 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
637 {
638 init();
639
640 this->file = GRF;
641 this->reg = v->alloc.allocate(v->type_size(type));
642
643 if (type->is_array() || type->is_record()) {
644 this->swizzle = BRW_SWIZZLE_NOOP;
645 } else {
646 this->swizzle = brw_swizzle_for_size(type->vector_elements);
647 }
648
649 this->type = brw_type_for_base_type(type);
650 }
651
652 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
653 {
654 assert(size > 0);
655
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(v->type_size(type) * size);
660
661 this->swizzle = BRW_SWIZZLE_NOOP;
662
663 this->type = brw_type_for_base_type(type);
664 }
665
666 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
667 {
668 init();
669
670 this->file = GRF;
671 this->reg = v->alloc.allocate(v->type_size(type));
672
673 if (type->is_array() || type->is_record()) {
674 this->writemask = WRITEMASK_XYZW;
675 } else {
676 this->writemask = (1 << type->vector_elements) - 1;
677 }
678
679 this->type = brw_type_for_base_type(type);
680 }
681
682 void
683 vec4_visitor::setup_vector_uniform_values(const gl_constant_value *values,
684 unsigned n)
685 {
686 static const gl_constant_value zero = { 0 };
687
688 for (unsigned i = 0; i < n; ++i)
689 stage_prog_data->param[4 * uniforms + i] = &values[i];
690
691 for (unsigned i = n; i < 4; ++i)
692 stage_prog_data->param[4 * uniforms + i] = &zero;
693
694 uniform_vector_size[uniforms++] = n;
695 }
696
697 /* Our support for uniforms is piggy-backed on the gl_program struct
698  * for the stage, because that's where the values actually
699  * get stored, rather than in some global gl_shader_program uniform
700  * store.
701 */
702 void
703 vec4_visitor::setup_uniform_values(ir_variable *ir)
704 {
705 int namelen = strlen(ir->name);
706
707 /* The data for our (non-builtin) uniforms is stored in a series of
708 * gl_uniform_driver_storage structs for each subcomponent that
709 * glGetUniformLocation() could name. We know it's been set up in the same
710 * order we'd walk the type, so walk the list of storage and find anything
711 * with our name, or the prefix of a component that starts with our name.
712 */
713 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
714 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
715
716 if (storage->builtin)
717 continue;
718
719 if (strncmp(ir->name, storage->name, namelen) != 0 ||
720 (storage->name[namelen] != 0 &&
721 storage->name[namelen] != '.' &&
722 storage->name[namelen] != '[')) {
723 continue;
724 }
725
726 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
727 storage->type->matrix_columns);
728 const unsigned vector_size = storage->type->vector_elements;
729
730 for (unsigned s = 0; s < vector_count; s++)
731 setup_vector_uniform_values(&storage->storage[s * vector_size],
732 vector_size);
733 }
734 }
735
736 void
737 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
738 {
739 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
740 assert(this->uniforms < uniform_array_size);
741 this->uniform_vector_size[this->uniforms] = 4;
742 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
743 this->userplane[i].type = BRW_REGISTER_TYPE_F;
744 for (int j = 0; j < 4; ++j) {
745 stage_prog_data->param[this->uniforms * 4 + j] =
746 (gl_constant_value *) &clip_planes[i][j];
747 }
748 ++this->uniforms;
749 }
750 }
751
752 /* Our support for builtin uniforms is even scarier than non-builtin.
753 * It sits on top of the PROG_STATE_VAR parameters that are
754 * automatically updated from GL context state.
755 */
756 void
757 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
758 {
759 const ir_state_slot *const slots = ir->get_state_slots();
760 assert(slots != NULL);
761
762 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
763 /* This state reference has already been setup by ir_to_mesa,
764 * but we'll get the same index back here. We can reference
765 * ParameterValues directly, since unlike brw_fs.cpp, we never
766 * add new state references during compile.
767 */
768 int index = _mesa_add_state_reference(this->prog->Parameters,
769 (gl_state_index *)slots[i].tokens);
770 gl_constant_value *values =
771 &this->prog->Parameters->ParameterValues[index][0];
772
773 assert(this->uniforms < uniform_array_size);
774
775 for (unsigned j = 0; j < 4; j++)
776 stage_prog_data->param[this->uniforms * 4 + j] =
777 &values[GET_SWZ(slots[i].swizzle, j)];
778
779 this->uniform_vector_size[this->uniforms] =
780 (ir->type->is_scalar() || ir->type->is_vector() ||
781 ir->type->is_matrix() ? ir->type->vector_elements : 4);
782
783 this->uniforms++;
784 }
785 }
786
787 dst_reg *
788 vec4_visitor::variable_storage(ir_variable *var)
789 {
790 return (dst_reg *)hash_table_find(this->variable_ht, var);
791 }
792
793 void
794 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
795 enum brw_predicate *predicate)
796 {
797 ir_expression *expr = ir->as_expression();
798
799 *predicate = BRW_PREDICATE_NORMAL;
800
801 if (expr && expr->operation != ir_binop_ubo_load) {
802 src_reg op[3];
803 vec4_instruction *inst;
804
805 assert(expr->get_num_operands() <= 3);
806 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
807 expr->operands[i]->accept(this);
808 op[i] = this->result;
809
810 resolve_ud_negate(&op[i]);
811 }
812
813 switch (expr->operation) {
814 case ir_unop_logic_not:
815 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
816 inst->conditional_mod = BRW_CONDITIONAL_Z;
817 break;
818
819 case ir_binop_logic_xor:
820 if (devinfo->gen <= 5) {
821 src_reg temp = src_reg(this, ir->type);
822 emit(XOR(dst_reg(temp), op[0], op[1]));
823 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
824 } else {
825 inst = emit(XOR(dst_null_d(), op[0], op[1]));
826 }
827 inst->conditional_mod = BRW_CONDITIONAL_NZ;
828 break;
829
830 case ir_binop_logic_or:
831 if (devinfo->gen <= 5) {
832 src_reg temp = src_reg(this, ir->type);
833 emit(OR(dst_reg(temp), op[0], op[1]));
834 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
835 } else {
836 inst = emit(OR(dst_null_d(), op[0], op[1]));
837 }
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 break;
840
841 case ir_binop_logic_and:
842 if (devinfo->gen <= 5) {
843 src_reg temp = src_reg(this, ir->type);
844 emit(AND(dst_reg(temp), op[0], op[1]));
845 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
846 } else {
847 inst = emit(AND(dst_null_d(), op[0], op[1]));
848 }
849 inst->conditional_mod = BRW_CONDITIONAL_NZ;
850 break;
851
852 case ir_unop_f2b:
853 if (devinfo->gen >= 6) {
854 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
855 } else {
856 inst = emit(MOV(dst_null_f(), op[0]));
857 inst->conditional_mod = BRW_CONDITIONAL_NZ;
858 }
859 break;
860
861 case ir_unop_i2b:
862 if (devinfo->gen >= 6) {
863 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
864 } else {
865 inst = emit(MOV(dst_null_d(), op[0]));
866 inst->conditional_mod = BRW_CONDITIONAL_NZ;
867 }
868 break;
869
870 case ir_binop_all_equal:
871 if (devinfo->gen <= 5) {
872 resolve_bool_comparison(expr->operands[0], &op[0]);
873 resolve_bool_comparison(expr->operands[1], &op[1]);
874 }
875 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
876 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
877 break;
878
879 case ir_binop_any_nequal:
880 if (devinfo->gen <= 5) {
881 resolve_bool_comparison(expr->operands[0], &op[0]);
882 resolve_bool_comparison(expr->operands[1], &op[1]);
883 }
884 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
885 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
886 break;
887
888 case ir_unop_any:
889 if (devinfo->gen <= 5) {
890 resolve_bool_comparison(expr->operands[0], &op[0]);
891 }
892 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
893 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
894 break;
895
896 case ir_binop_greater:
897 case ir_binop_gequal:
898 case ir_binop_less:
899 case ir_binop_lequal:
900 case ir_binop_equal:
901 case ir_binop_nequal:
902 if (devinfo->gen <= 5) {
903 resolve_bool_comparison(expr->operands[0], &op[0]);
904 resolve_bool_comparison(expr->operands[1], &op[1]);
905 }
906 emit(CMP(dst_null_d(), op[0], op[1],
907 brw_conditional_for_comparison(expr->operation)));
908 break;
909
910 case ir_triop_csel: {
911 /* Expand the boolean condition into the flag register. */
912 inst = emit(MOV(dst_null_d(), op[0]));
913 inst->conditional_mod = BRW_CONDITIONAL_NZ;
914
915 /* Select which boolean to return. */
916 dst_reg temp(this, expr->operands[1]->type);
917 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
918 inst->predicate = BRW_PREDICATE_NORMAL;
919
920 /* Expand the result to a condition code. */
921 inst = emit(MOV(dst_null_d(), src_reg(temp)));
922 inst->conditional_mod = BRW_CONDITIONAL_NZ;
923 break;
924 }
925
926 default:
927 unreachable("not reached");
928 }
929 return;
930 }
931
932 ir->accept(this);
933
934 resolve_ud_negate(&this->result);
935
936 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
937 inst->conditional_mod = BRW_CONDITIONAL_NZ;
938 }
939
940 /**
941 * Emit a gen6 IF statement with the comparison folded into the IF
942 * instruction.
943 */
944 void
945 vec4_visitor::emit_if_gen6(ir_if *ir)
946 {
947 ir_expression *expr = ir->condition->as_expression();
948
949 if (expr && expr->operation != ir_binop_ubo_load) {
950 src_reg op[3];
951 dst_reg temp;
952
953 assert(expr->get_num_operands() <= 3);
954 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
955 expr->operands[i]->accept(this);
956 op[i] = this->result;
957 }
958
959 switch (expr->operation) {
960 case ir_unop_logic_not:
961 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
962 return;
963
964 case ir_binop_logic_xor:
965 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
966 return;
967
968 case ir_binop_logic_or:
969 temp = dst_reg(this, glsl_type::bool_type);
970 emit(OR(temp, op[0], op[1]));
971 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
972 return;
973
974 case ir_binop_logic_and:
975 temp = dst_reg(this, glsl_type::bool_type);
976 emit(AND(temp, op[0], op[1]));
977 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
978 return;
979
980 case ir_unop_f2b:
981 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
982 return;
983
984 case ir_unop_i2b:
985 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
986 return;
987
988 case ir_binop_greater:
989 case ir_binop_gequal:
990 case ir_binop_less:
991 case ir_binop_lequal:
992 case ir_binop_equal:
993 case ir_binop_nequal:
994 emit(IF(op[0], op[1],
995 brw_conditional_for_comparison(expr->operation)));
996 return;
997
998 case ir_binop_all_equal:
999 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1000 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1001 return;
1002
1003 case ir_binop_any_nequal:
1004 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1005 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1006 return;
1007
1008 case ir_unop_any:
1009 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1010 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1011 return;
1012
1013 case ir_triop_csel: {
1014 /* Expand the boolean condition into the flag register. */
1015 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1016 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1017
1018 /* Select which boolean to return. */
1019 dst_reg temp(this, expr->operands[1]->type);
1020 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1021 inst->predicate = BRW_PREDICATE_NORMAL;
1022
1023 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1024 return;
1025 }
1026
1027 default:
1028 unreachable("not reached");
1029 }
1030 return;
1031 }
1032
1033 ir->condition->accept(this);
1034
1035 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1036 }
1037
1038 void
1039 vec4_visitor::visit(ir_variable *ir)
1040 {
1041 dst_reg *reg = NULL;
1042
1043 if (variable_storage(ir))
1044 return;
1045
1046 switch (ir->data.mode) {
1047 case ir_var_shader_in:
1048 assert(ir->data.location != -1);
1049 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1050 break;
1051
1052 case ir_var_shader_out:
1053 assert(ir->data.location != -1);
1054 reg = new(mem_ctx) dst_reg(this, ir->type);
1055
1056 for (int i = 0; i < type_size(ir->type); i++) {
1057 output_reg[ir->data.location + i] = *reg;
1058 output_reg[ir->data.location + i].reg_offset = i;
1059 output_reg[ir->data.location + i].type =
1060 brw_type_for_base_type(ir->type->get_scalar_type());
1061 output_reg_annotation[ir->data.location + i] = ir->name;
1062 }
1063 break;
1064
1065 case ir_var_auto:
1066 case ir_var_temporary:
1067 reg = new(mem_ctx) dst_reg(this, ir->type);
1068 break;
1069
1070 case ir_var_uniform:
1071 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1072
1073 /* Thanks to the lower_ubo_reference pass, we will see only
1074 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1075 * variables, so no need for them to be in variable_ht.
1076 *
1077 * Some uniforms, such as samplers and atomic counters, have no actual
1078 * storage, so we should ignore them.
1079 */
1080 if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
1081 return;
1082
1083 /* Track how big the whole uniform variable is, in case we need to put a
1084 * copy of its data into pull constants for array access.
1085 */
1086 assert(this->uniforms < uniform_array_size);
1087 this->uniform_size[this->uniforms] = type_size(ir->type);
1088
1089 if (!strncmp(ir->name, "gl_", 3)) {
1090 setup_builtin_uniform_values(ir);
1091 } else {
1092 setup_uniform_values(ir);
1093 }
1094 break;
1095
1096 case ir_var_system_value:
1097 reg = make_reg_for_system_value(ir->data.location, ir->type);
1098 break;
1099
1100 default:
1101 unreachable("not reached");
1102 }
1103
1104 reg->type = brw_type_for_base_type(ir->type);
1105 hash_table_insert(this->variable_ht, reg, ir);
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_loop *ir)
1110 {
1111 /* We don't want debugging output to print the whole body of the
1112 * loop as the annotation.
1113 */
1114 this->base_ir = NULL;
1115
1116 emit(BRW_OPCODE_DO);
1117
1118 visit_instructions(&ir->body_instructions);
1119
1120 emit(BRW_OPCODE_WHILE);
1121 }
1122
1123 void
1124 vec4_visitor::visit(ir_loop_jump *ir)
1125 {
1126 switch (ir->mode) {
1127 case ir_loop_jump::jump_break:
1128 emit(BRW_OPCODE_BREAK);
1129 break;
1130 case ir_loop_jump::jump_continue:
1131 emit(BRW_OPCODE_CONTINUE);
1132 break;
1133 }
1134 }
1135
1136
1137 void
1138 vec4_visitor::visit(ir_function_signature *)
1139 {
1140 unreachable("not reached");
1141 }
1142
1143 void
1144 vec4_visitor::visit(ir_function *ir)
1145 {
1146 /* Ignore function bodies other than main() -- we shouldn't see calls to
1147 * them since they should all be inlined.
1148 */
1149 if (strcmp(ir->name, "main") == 0) {
1150 const ir_function_signature *sig;
1151 exec_list empty;
1152
1153 sig = ir->matching_signature(NULL, &empty, false);
1154
1155 assert(sig);
1156
1157 visit_instructions(&sig->body);
1158 }
1159 }
1160
1161 bool
1162 vec4_visitor::try_emit_mad(ir_expression *ir)
1163 {
1164 /* 3-src instructions were introduced in gen6. */
1165 if (devinfo->gen < 6)
1166 return false;
1167
1168 /* MAD can only handle floating-point data. */
1169 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1170 return false;
1171
1172 ir_rvalue *nonmul;
1173 ir_expression *mul;
1174 bool mul_negate, mul_abs;
1175
1176 for (int i = 0; i < 2; i++) {
1177 mul_negate = false;
1178 mul_abs = false;
1179
1180 mul = ir->operands[i]->as_expression();
1181 nonmul = ir->operands[1 - i];
1182
1183 if (mul && mul->operation == ir_unop_abs) {
1184 mul = mul->operands[0]->as_expression();
1185 mul_abs = true;
1186 } else if (mul && mul->operation == ir_unop_neg) {
1187 mul = mul->operands[0]->as_expression();
1188 mul_negate = true;
1189 }
1190
1191 if (mul && mul->operation == ir_binop_mul)
1192 break;
1193 }
1194
1195 if (!mul || mul->operation != ir_binop_mul)
1196 return false;
1197
1198 nonmul->accept(this);
1199 src_reg src0 = fix_3src_operand(this->result);
1200
1201 mul->operands[0]->accept(this);
1202 src_reg src1 = fix_3src_operand(this->result);
1203 src1.negate ^= mul_negate;
1204 src1.abs = mul_abs;
1205 if (mul_abs)
1206 src1.negate = false;
1207
1208 mul->operands[1]->accept(this);
1209 src_reg src2 = fix_3src_operand(this->result);
1210 src2.abs = mul_abs;
1211 if (mul_abs)
1212 src2.negate = false;
1213
1214 this->result = src_reg(this, ir->type);
1215 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1216
1217 return true;
1218 }
1219
1220 bool
1221 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1222 {
1223 /* This optimization relies on CMP setting the destination to 0 when
1224 * false. Early hardware only sets the least significant bit, and
1225 * leaves the other bits undefined. So we can't use it.
1226 */
1227 if (devinfo->gen < 6)
1228 return false;
1229
1230 ir_expression *const cmp = ir->operands[0]->as_expression();
1231
1232 if (cmp == NULL)
1233 return false;
1234
1235 switch (cmp->operation) {
1236 case ir_binop_less:
1237 case ir_binop_greater:
1238 case ir_binop_lequal:
1239 case ir_binop_gequal:
1240 case ir_binop_equal:
1241 case ir_binop_nequal:
1242 break;
1243
1244 default:
1245 return false;
1246 }
1247
1248 cmp->operands[0]->accept(this);
1249 const src_reg cmp_src0 = this->result;
1250
1251 cmp->operands[1]->accept(this);
1252 const src_reg cmp_src1 = this->result;
1253
1254 this->result = src_reg(this, ir->type);
1255
1256 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1257 brw_conditional_for_comparison(cmp->operation)));
1258
1259 /* If the comparison is false, this->result will just happen to be zero.
1260 */
1261 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1262 this->result, src_reg(1.0f));
1263 inst->predicate = BRW_PREDICATE_NORMAL;
1264 inst->predicate_inverse = true;
1265
1266 return true;
1267 }
1268
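/* Emit a MIN or MAX, selected by conditionalmod (BRW_CONDITIONAL_L or
 * BRW_CONDITIONAL_GE). Gen6+ can use SEL with a conditional modifier
 * directly; earlier hardware needs an explicit CMP to set the flag followed
 * by a predicated SEL.
 */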
1269 void
1270 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1271 src_reg src0, src_reg src1)
1272 {
1273 vec4_instruction *inst;
1274
1275 if (devinfo->gen >= 6) {
1276 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1277 inst->conditional_mod = conditionalmod;
1278 } else {
1279 emit(CMP(dst, src0, src1, conditionalmod));
1280
1281 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1282 inst->predicate = BRW_PREDICATE_NORMAL;
1283 }
1284 }
1285
1286 void
1287 vec4_visitor::emit_lrp(const dst_reg &dst,
1288 const src_reg &x, const src_reg &y, const src_reg &a)
1289 {
1290 if (devinfo->gen >= 6) {
1291 /* Note that the instruction's argument order is reversed from GLSL
1292 * and the IR.
1293 */
1294 emit(LRP(dst,
1295 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1296 } else {
1297 /* Earlier generations don't support three source operations, so we
1298 * need to emit x*(1-a) + y*a.
1299 */
1300 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1301 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1302 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1303 y_times_a.writemask = dst.writemask;
1304 one_minus_a.writemask = dst.writemask;
1305 x_times_one_minus_a.writemask = dst.writemask;
1306
1307 emit(MUL(y_times_a, y, a));
1308 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1309 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1310 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1311 }
1312 }
1313
1314 /**
1315 * Emits the instructions needed to perform a pull constant load. before_block
1316  * and before_inst can be NULL, in which case the instructions will be appended
1317 * to the end of the instruction list.
1318 */
1319 void
1320 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1321 src_reg surf_index,
1322 src_reg offset_reg,
1323 bblock_t *before_block,
1324 vec4_instruction *before_inst)
1325 {
1326 assert((before_inst == NULL && before_block == NULL) ||
1327 (before_inst && before_block));
1328
1329 vec4_instruction *pull;
1330
1331 if (devinfo->gen >= 9) {
1332 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1333 src_reg header(this, glsl_type::uvec4_type, 2);
1334
1335 pull = new(mem_ctx)
1336 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1337 dst_reg(header));
1338
1339 if (before_inst)
1340 emit_before(before_block, before_inst, pull);
1341 else
1342 emit(pull);
1343
1344 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1345 offset_reg.type);
1346 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1347
1348 if (before_inst)
1349 emit_before(before_block, before_inst, pull);
1350 else
1351 emit(pull);
1352
1353 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1354 dst,
1355 surf_index,
1356 header);
1357 pull->mlen = 2;
1358 pull->header_size = 1;
1359 } else if (devinfo->gen >= 7) {
1360 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1361
1362 grf_offset.type = offset_reg.type;
1363
1364 pull = MOV(grf_offset, offset_reg);
1365
1366 if (before_inst)
1367 emit_before(before_block, before_inst, pull);
1368 else
1369 emit(pull);
1370
1371 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1372 dst,
1373 surf_index,
1374 src_reg(grf_offset));
1375 pull->mlen = 1;
1376 } else {
1377 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1378 dst,
1379 surf_index,
1380 offset_reg);
1381 pull->base_mrf = 14;
1382 pull->mlen = 1;
1383 }
1384
1385 if (before_inst)
1386 emit_before(before_block, before_inst, pull);
1387 else
1388 emit(pull);
1389 }
1390
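/* Copy the value of src from an arbitrary live channel into every channel of
 * the result, so that a potentially divergent value can be used where a
 * scalar (for example a surface index) is required.
 */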
1391 src_reg
1392 vec4_visitor::emit_uniformize(const src_reg &src)
1393 {
1394 const src_reg chan_index(this, glsl_type::uint_type);
1395 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1396 src.type);
1397
1398 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1399 ->force_writemask_all = true;
1400 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1401 ->force_writemask_all = true;
1402
1403 return src_reg(dst);
1404 }
1405
1406 void
1407 vec4_visitor::visit(ir_expression *ir)
1408 {
1409 unsigned int operand;
1410 src_reg op[ARRAY_SIZE(ir->operands)];
1411 vec4_instruction *inst;
1412
1413 if (ir->operation == ir_binop_add) {
1414 if (try_emit_mad(ir))
1415 return;
1416 }
1417
1418 if (ir->operation == ir_unop_b2f) {
1419 if (try_emit_b2f_of_compare(ir))
1420 return;
1421 }
1422
1423 /* Storage for our result. Ideally for an assignment we'd be using
1424 * the actual storage for the result here, instead.
1425 */
1426 dst_reg result_dst(this, ir->type);
1427 src_reg result_src(result_dst);
1428
1429 if (ir->operation == ir_triop_csel) {
1430 ir->operands[1]->accept(this);
1431 op[1] = this->result;
1432 ir->operands[2]->accept(this);
1433 op[2] = this->result;
1434
1435 enum brw_predicate predicate;
1436 emit_bool_to_cond_code(ir->operands[0], &predicate);
1437 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1438 inst->predicate = predicate;
1439 this->result = result_src;
1440 return;
1441 }
1442
1443 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1444 this->result.file = BAD_FILE;
1445 ir->operands[operand]->accept(this);
1446 if (this->result.file == BAD_FILE) {
1447 fprintf(stderr, "Failed to get tree for expression operand:\n");
1448 ir->operands[operand]->fprint(stderr);
1449 exit(1);
1450 }
1451 op[operand] = this->result;
1452
1453 /* Matrix expression operands should have been broken down to vector
1454 * operations already.
1455 */
1456 assert(!ir->operands[operand]->type->is_matrix());
1457 }
1458
1459 /* If nothing special happens, this is the result. */
1460 this->result = result_src;
1461
1462 switch (ir->operation) {
1463 case ir_unop_logic_not:
1464 emit(NOT(result_dst, op[0]));
1465 break;
1466 case ir_unop_neg:
1467 op[0].negate = !op[0].negate;
1468 emit(MOV(result_dst, op[0]));
1469 break;
1470 case ir_unop_abs:
1471 op[0].abs = true;
1472 op[0].negate = false;
1473 emit(MOV(result_dst, op[0]));
1474 break;
1475
1476 case ir_unop_sign:
1477 if (ir->type->is_float()) {
1478 /* AND(val, 0x80000000) gives the sign bit.
1479 *
1480 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1481 * zero.
1482 */
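         /* Worked example: for op[0] = -2.5f (0xc0200000) the CMP predicate
          * is true, the AND leaves 0x80000000 in the result, and the
          * predicated OR then yields 0xbf800000, i.e. -1.0f.
          */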
1483 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1484
1485 op[0].type = BRW_REGISTER_TYPE_UD;
1486 result_dst.type = BRW_REGISTER_TYPE_UD;
1487 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1488
1489 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1490 inst->predicate = BRW_PREDICATE_NORMAL;
1491
1492 this->result.type = BRW_REGISTER_TYPE_F;
1493 } else {
1494 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1495 * -> non-negative val generates 0x00000000.
1496 * Predicated OR sets 1 if val is positive.
1497 */
1498 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1499
1500 emit(ASR(result_dst, op[0], src_reg(31)));
1501
1502 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1503 inst->predicate = BRW_PREDICATE_NORMAL;
1504 }
1505 break;
1506
1507 case ir_unop_rcp:
1508 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1509 break;
1510
1511 case ir_unop_exp2:
1512 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1513 break;
1514 case ir_unop_log2:
1515 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1516 break;
1517 case ir_unop_exp:
1518 case ir_unop_log:
1519 unreachable("not reached: should be handled by ir_explog_to_explog2");
1520 case ir_unop_sin:
1521 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1522 break;
1523 case ir_unop_cos:
1524 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1525 break;
1526
1527 case ir_unop_dFdx:
1528 case ir_unop_dFdx_coarse:
1529 case ir_unop_dFdx_fine:
1530 case ir_unop_dFdy:
1531 case ir_unop_dFdy_coarse:
1532 case ir_unop_dFdy_fine:
1533 unreachable("derivatives not valid in vertex shader");
1534
1535 case ir_unop_bitfield_reverse:
1536 emit(BFREV(result_dst, op[0]));
1537 break;
1538 case ir_unop_bit_count:
1539 emit(CBIT(result_dst, op[0]));
1540 break;
1541 case ir_unop_find_msb: {
1542 src_reg temp = src_reg(this, glsl_type::uint_type);
1543
1544 inst = emit(FBH(dst_reg(temp), op[0]));
1545 inst->dst.writemask = WRITEMASK_XYZW;
1546
1547 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1548 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1549 * subtract the result from 31 to convert the MSB count into an LSB count.
1550 */
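      /* E.g. for an input of 0x00000100, FBH returns 23 and 31 - 23 = 8,
       * which is the value findMSB() must return.
       */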
1551
1552 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1553 temp.swizzle = BRW_SWIZZLE_NOOP;
1554 emit(MOV(result_dst, temp));
1555
1556 src_reg src_tmp = src_reg(result_dst);
1557 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1558
1559 src_tmp.negate = true;
1560 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1561 inst->predicate = BRW_PREDICATE_NORMAL;
1562 break;
1563 }
1564 case ir_unop_find_lsb:
1565 emit(FBL(result_dst, op[0]));
1566 break;
1567 case ir_unop_saturate:
1568 inst = emit(MOV(result_dst, op[0]));
1569 inst->saturate = true;
1570 break;
1571
1572 case ir_unop_noise:
1573 unreachable("not reached: should be handled by lower_noise");
1574
1575 case ir_unop_subroutine_to_int:
1576 emit(MOV(result_dst, op[0]));
1577 break;
1578
1579 case ir_binop_add:
1580 emit(ADD(result_dst, op[0], op[1]));
1581 break;
1582 case ir_binop_sub:
1583 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1584
1585 case ir_binop_mul:
1586 if (devinfo->gen < 8 && ir->type->is_integer()) {
1587 /* For integer multiplication, the MUL uses the low 16 bits of one of
1588 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1589 * accumulates in the contribution of the upper 16 bits of that
1590 * operand. If we can determine that one of the args is in the low
1591 * 16 bits, though, we can just emit a single MUL.
1592 */
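         /* In the general case below, MUL leaves the partial product in the
          * accumulator, MACH folds in the contribution of the upper 16 bits,
          * and the trailing MOV reads the low 32 bits of the product back out
          * of the accumulator.
          */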
1593 if (ir->operands[0]->is_uint16_constant()) {
1594 if (devinfo->gen < 7)
1595 emit(MUL(result_dst, op[0], op[1]));
1596 else
1597 emit(MUL(result_dst, op[1], op[0]));
1598 } else if (ir->operands[1]->is_uint16_constant()) {
1599 if (devinfo->gen < 7)
1600 emit(MUL(result_dst, op[1], op[0]));
1601 else
1602 emit(MUL(result_dst, op[0], op[1]));
1603 } else {
1604 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1605
1606 emit(MUL(acc, op[0], op[1]));
1607 emit(MACH(dst_null_d(), op[0], op[1]));
1608 emit(MOV(result_dst, src_reg(acc)));
1609 }
1610 } else {
1611 emit(MUL(result_dst, op[0], op[1]));
1612 }
1613 break;
1614 case ir_binop_imul_high: {
1615 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1616
1617 emit(MUL(acc, op[0], op[1]));
1618 emit(MACH(result_dst, op[0], op[1]));
1619 break;
1620 }
1621 case ir_binop_div:
1622 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1623 assert(ir->type->is_integer());
1624 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1625 break;
1626
1627 case ir_binop_carry:
1628 unreachable("Should have been lowered by carry_to_arith().");
1629
1630 case ir_binop_borrow:
1631 unreachable("Should have been lowered by borrow_to_arith().");
1632
1633 case ir_binop_mod:
1634 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1635 assert(ir->type->is_integer());
1636 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1637 break;
1638
1639 case ir_binop_less:
1640 case ir_binop_greater:
1641 case ir_binop_lequal:
1642 case ir_binop_gequal:
1643 case ir_binop_equal:
1644 case ir_binop_nequal: {
1645 if (devinfo->gen <= 5) {
1646 resolve_bool_comparison(ir->operands[0], &op[0]);
1647 resolve_bool_comparison(ir->operands[1], &op[1]);
1648 }
1649 emit(CMP(result_dst, op[0], op[1],
1650 brw_conditional_for_comparison(ir->operation)));
1651 break;
1652 }
1653
1654 case ir_binop_all_equal:
1655 if (devinfo->gen <= 5) {
1656 resolve_bool_comparison(ir->operands[0], &op[0]);
1657 resolve_bool_comparison(ir->operands[1], &op[1]);
1658 }
1659
1660 /* "==" operator producing a scalar boolean. */
1661 if (ir->operands[0]->type->is_vector() ||
1662 ir->operands[1]->type->is_vector()) {
1663 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1664 emit(MOV(result_dst, src_reg(0)));
1665 inst = emit(MOV(result_dst, src_reg(~0)));
1666 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1667 } else {
1668 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1669 }
1670 break;
1671 case ir_binop_any_nequal:
1672 if (devinfo->gen <= 5) {
1673 resolve_bool_comparison(ir->operands[0], &op[0]);
1674 resolve_bool_comparison(ir->operands[1], &op[1]);
1675 }
1676
1677 /* "!=" operator producing a scalar boolean. */
1678 if (ir->operands[0]->type->is_vector() ||
1679 ir->operands[1]->type->is_vector()) {
1680 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1681
1682 emit(MOV(result_dst, src_reg(0)));
1683 inst = emit(MOV(result_dst, src_reg(~0)));
1684 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1685 } else {
1686 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1687 }
1688 break;
1689
1690 case ir_unop_any:
1691 if (devinfo->gen <= 5) {
1692 resolve_bool_comparison(ir->operands[0], &op[0]);
1693 }
1694 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1695 emit(MOV(result_dst, src_reg(0)));
1696
1697 inst = emit(MOV(result_dst, src_reg(~0)));
1698 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1699 break;
1700
1701 case ir_binop_logic_xor:
1702 emit(XOR(result_dst, op[0], op[1]));
1703 break;
1704
1705 case ir_binop_logic_or:
1706 emit(OR(result_dst, op[0], op[1]));
1707 break;
1708
1709 case ir_binop_logic_and:
1710 emit(AND(result_dst, op[0], op[1]));
1711 break;
1712
1713 case ir_binop_dot:
1714 assert(ir->operands[0]->type->is_vector());
1715 assert(ir->operands[0]->type == ir->operands[1]->type);
1716 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1717 break;
1718
1719 case ir_unop_sqrt:
1720 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1721 break;
1722 case ir_unop_rsq:
1723 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1724 break;
1725
1726 case ir_unop_bitcast_i2f:
1727 case ir_unop_bitcast_u2f:
1728 this->result = op[0];
1729 this->result.type = BRW_REGISTER_TYPE_F;
1730 break;
1731
1732 case ir_unop_bitcast_f2i:
1733 this->result = op[0];
1734 this->result.type = BRW_REGISTER_TYPE_D;
1735 break;
1736
1737 case ir_unop_bitcast_f2u:
1738 this->result = op[0];
1739 this->result.type = BRW_REGISTER_TYPE_UD;
1740 break;
1741
1742 case ir_unop_i2f:
1743 case ir_unop_i2u:
1744 case ir_unop_u2i:
1745 case ir_unop_u2f:
1746 case ir_unop_f2i:
1747 case ir_unop_f2u:
1748 emit(MOV(result_dst, op[0]));
1749 break;
1750 case ir_unop_b2i:
1751 case ir_unop_b2f:
1752 if (devinfo->gen <= 5) {
1753 resolve_bool_comparison(ir->operands[0], &op[0]);
1754 }
1755 emit(MOV(result_dst, negate(op[0])));
1756 break;
1757 case ir_unop_f2b:
1758 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1759 break;
1760 case ir_unop_i2b:
1761 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1762 break;
1763
1764 case ir_unop_trunc:
1765 emit(RNDZ(result_dst, op[0]));
1766 break;
1767 case ir_unop_ceil: {
1768 src_reg tmp = src_reg(this, ir->type);
1769 op[0].negate = !op[0].negate;
1770 emit(RNDD(dst_reg(tmp), op[0]));
1771 tmp.negate = true;
1772 emit(MOV(result_dst, tmp));
1773 }
1774 break;
1775 case ir_unop_floor:
1776 inst = emit(RNDD(result_dst, op[0]));
1777 break;
1778 case ir_unop_fract:
1779 inst = emit(FRC(result_dst, op[0]));
1780 break;
1781 case ir_unop_round_even:
1782 emit(RNDE(result_dst, op[0]));
1783 break;
1784
1785 case ir_binop_min:
1786 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1787 break;
1788 case ir_binop_max:
1789 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1790 break;
1791
1792 case ir_binop_pow:
1793 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1794 break;
1795
1796 case ir_unop_bit_not:
1797 inst = emit(NOT(result_dst, op[0]));
1798 break;
1799 case ir_binop_bit_and:
1800 inst = emit(AND(result_dst, op[0], op[1]));
1801 break;
1802 case ir_binop_bit_xor:
1803 inst = emit(XOR(result_dst, op[0], op[1]));
1804 break;
1805 case ir_binop_bit_or:
1806 inst = emit(OR(result_dst, op[0], op[1]));
1807 break;
1808
1809 case ir_binop_lshift:
1810 inst = emit(SHL(result_dst, op[0], op[1]));
1811 break;
1812
1813 case ir_binop_rshift:
1814 if (ir->type->base_type == GLSL_TYPE_INT)
1815 inst = emit(ASR(result_dst, op[0], op[1]));
1816 else
1817 inst = emit(SHR(result_dst, op[0], op[1]));
1818 break;
1819
1820 case ir_binop_bfm:
1821 emit(BFI1(result_dst, op[0], op[1]));
1822 break;
1823
1824 case ir_binop_ubo_load: {
1825 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1826 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1827 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1828 src_reg offset;
1829
1830 /* Now, load the vector from that offset. */
1831 assert(ir->type->is_vector() || ir->type->is_scalar());
1832
1833 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1834 packed_consts.type = result.type;
1835 src_reg surf_index;
1836
1837 if (const_uniform_block) {
1838 /* The block index is a constant, so just emit the binding table entry
1839 * as an immediate.
1840 */
1841 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1842 const_uniform_block->value.u[0]);
1843 } else {
1844 /* The block index is not a constant. Evaluate the index expression
1845 * per-channel and add the base UBO index; we have to select a value
1846 * from any live channel.
1847 */
1848 surf_index = src_reg(this, glsl_type::uint_type);
1849 emit(ADD(dst_reg(surf_index), op[0],
1850 src_reg(prog_data->base.binding_table.ubo_start)));
1851 surf_index = emit_uniformize(surf_index);
1852
1853 /* Assume this may touch any UBO. It would be nice to provide
1854 * a tighter bound, but the array information is already lowered away.
1855 */
1856 brw_mark_surface_used(&prog_data->base,
1857 prog_data->base.binding_table.ubo_start +
1858 shader_prog->NumUniformBlocks - 1);
1859 }
1860
1861 if (const_offset_ir) {
1862 if (devinfo->gen >= 8) {
1863 /* Store the offset in a GRF so we can send-from-GRF. */
1864 offset = src_reg(this, glsl_type::int_type);
1865 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1866 } else {
1867 /* Immediates are fine on older generations since they'll be moved
1868 * to a (potentially fake) MRF at the generator level.
1869 */
1870 offset = src_reg(const_offset / 16);
1871 }
1872 } else {
1873 offset = src_reg(this, glsl_type::uint_type);
1874 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1875 }
1876
1877 emit_pull_constant_load_reg(dst_reg(packed_consts),
1878 surf_index,
1879 offset,
1880 NULL, NULL /* before_block/inst */);
1881
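      /* Pick the requested dwords out of the 16-byte slot that was loaded:
       * offsetting every swizzle component by const_offset % 16 / 4 makes,
       * e.g., a float at byte offset 8 within the slot read the .z channel
       * of packed_consts.
       */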
1882 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1883 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1884 const_offset % 16 / 4,
1885 const_offset % 16 / 4,
1886 const_offset % 16 / 4);
1887
1888 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1889 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1890 emit(CMP(result_dst, packed_consts, src_reg(0u),
1891 BRW_CONDITIONAL_NZ));
1892 } else {
1893 emit(MOV(result_dst, packed_consts));
1894 }
1895 break;
1896 }
1897
1898 case ir_binop_vector_extract:
1899 unreachable("should have been lowered by vec_index_to_cond_assign");
1900
1901 case ir_triop_fma:
1902 op[0] = fix_3src_operand(op[0]);
1903 op[1] = fix_3src_operand(op[1]);
1904 op[2] = fix_3src_operand(op[2]);
1905 /* Note that the instruction's argument order is reversed from GLSL
1906 * and the IR.
1907 */
1908 emit(MAD(result_dst, op[2], op[1], op[0]));
1909 break;
1910
1911 case ir_triop_lrp:
1912 emit_lrp(result_dst, op[0], op[1], op[2]);
1913 break;
1914
1915 case ir_triop_csel:
1916 unreachable("already handled above");
1917 break;
1918
1919 case ir_triop_bfi:
1920 op[0] = fix_3src_operand(op[0]);
1921 op[1] = fix_3src_operand(op[1]);
1922 op[2] = fix_3src_operand(op[2]);
1923 emit(BFI2(result_dst, op[0], op[1], op[2]));
1924 break;
1925
1926 case ir_triop_bitfield_extract:
1927 op[0] = fix_3src_operand(op[0]);
1928 op[1] = fix_3src_operand(op[1]);
1929 op[2] = fix_3src_operand(op[2]);
1930 /* Note that the instruction's argument order is reversed from GLSL
1931 * and the IR.
1932 */
1933 emit(BFE(result_dst, op[2], op[1], op[0]));
1934 break;
1935
1936 case ir_triop_vector_insert:
1937 unreachable("should have been lowered by lower_vector_insert");
1938
1939 case ir_quadop_bitfield_insert:
1940 unreachable("not reached: should be handled by "
1941 "bitfield_insert_to_bfm_bfi\n");
1942
1943 case ir_quadop_vector:
1944 unreachable("not reached: should be handled by lower_quadop_vector");
1945
1946 case ir_unop_pack_half_2x16:
1947 emit_pack_half_2x16(result_dst, op[0]);
1948 break;
1949 case ir_unop_unpack_half_2x16:
1950 emit_unpack_half_2x16(result_dst, op[0]);
1951 break;
1952 case ir_unop_unpack_unorm_4x8:
1953 emit_unpack_unorm_4x8(result_dst, op[0]);
1954 break;
1955 case ir_unop_unpack_snorm_4x8:
1956 emit_unpack_snorm_4x8(result_dst, op[0]);
1957 break;
1958 case ir_unop_pack_unorm_4x8:
1959 emit_pack_unorm_4x8(result_dst, op[0]);
1960 break;
1961 case ir_unop_pack_snorm_4x8:
1962 emit_pack_snorm_4x8(result_dst, op[0]);
1963 break;
1964 case ir_unop_pack_snorm_2x16:
1965 case ir_unop_pack_unorm_2x16:
1966 case ir_unop_unpack_snorm_2x16:
1967 case ir_unop_unpack_unorm_2x16:
1968 unreachable("not reached: should be handled by lower_packing_builtins");
1969 case ir_unop_unpack_half_2x16_split_x:
1970 case ir_unop_unpack_half_2x16_split_y:
1971 case ir_binop_pack_half_2x16_split:
1972 case ir_unop_interpolate_at_centroid:
1973 case ir_binop_interpolate_at_sample:
1974 case ir_binop_interpolate_at_offset:
1975 unreachable("not reached: should not occur in vertex shader");
1976 case ir_binop_ldexp:
1977 unreachable("not reached: should be handled by ldexp_to_arith()");
1978 case ir_unop_d2f:
1979 case ir_unop_f2d:
1980 case ir_unop_d2i:
1981 case ir_unop_i2d:
1982 case ir_unop_d2u:
1983 case ir_unop_u2d:
1984 case ir_unop_d2b:
1985 case ir_unop_pack_double_2x32:
1986 case ir_unop_unpack_double_2x32:
1987 case ir_unop_frexp_sig:
1988 case ir_unop_frexp_exp:
1989 unreachable("fp64 todo");
1990 }
1991 }
1992
1993
1994 void
1995 vec4_visitor::visit(ir_swizzle *ir)
1996 {
1997 /* Note that this is only swizzles in expressions, not those on the left
1998 * hand side of an assignment, which do write masking. See ir_assignment
1999 * for that.
2000 */
2001 const unsigned swz = brw_compose_swizzle(
2002 brw_swizzle_for_size(ir->type->vector_elements),
2003 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2004
2005 ir->val->accept(this);
2006 this->result = swizzle(this->result, swz);
2007 }
2008
2009 void
2010 vec4_visitor::visit(ir_dereference_variable *ir)
2011 {
2012 const struct glsl_type *type = ir->type;
2013 dst_reg *reg = variable_storage(ir->var);
2014
2015 if (!reg) {
2016 fail("Failed to find variable storage for %s\n", ir->var->name);
2017 this->result = src_reg(brw_null_reg());
2018 return;
2019 }
2020
2021 this->result = src_reg(*reg);
2022
2023 /* System values get their swizzle from the dst_reg writemask */
2024 if (ir->var->data.mode == ir_var_system_value)
2025 return;
2026
2027 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2028 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2029 }
2030
2031
2032 int
2033 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2034 {
2035 /* Under normal circumstances array elements are stored consecutively, so
2036 * the stride is equal to the size of the array element.
2037 */
2038 return type_size(ir->type);
2039 }
2040
2041
2042 void
2043 vec4_visitor::visit(ir_dereference_array *ir)
2044 {
2045 ir_constant *constant_index;
2046 src_reg src;
2047 int array_stride = compute_array_stride(ir);
2048
2049 constant_index = ir->array_index->constant_expression_value();
2050
2051 ir->array->accept(this);
2052 src = this->result;
2053
2054 if (constant_index) {
2055 src.reg_offset += constant_index->value.i[0] * array_stride;
2056 } else {
2057       /* Variable index array dereference. It takes the "vec4" at the
2058        * base of the array plus an index register that offsets the Mesa
2059        * register index.
2060 */
2061 ir->array_index->accept(this);
2062
2063 src_reg index_reg;
2064
2065 if (array_stride == 1) {
2066 index_reg = this->result;
2067 } else {
2068 index_reg = src_reg(this, glsl_type::int_type);
2069
2070 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2071 }
2072
2073 if (src.reladdr) {
2074 src_reg temp = src_reg(this, glsl_type::int_type);
2075
2076 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2077
2078 index_reg = temp;
2079 }
2080
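      /* Store the index in ralloc'ed memory so the reladdr pointer stays
       * valid after this visit returns.
       */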
2081 src.reladdr = ralloc(mem_ctx, src_reg);
2082 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2083 }
2084
2085 /* If the type is smaller than a vec4, replicate the last channel out. */
2086 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2087 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2088 else
2089 src.swizzle = BRW_SWIZZLE_NOOP;
2090 src.type = brw_type_for_base_type(ir->type);
2091
2092 this->result = src;
2093 }
2094
2095 void
2096 vec4_visitor::visit(ir_dereference_record *ir)
2097 {
2098 unsigned int i;
2099 const glsl_type *struct_type = ir->record->type;
2100 int offset = 0;
2101
2102 ir->record->accept(this);
2103
2104 for (i = 0; i < struct_type->length; i++) {
2105 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2106 break;
2107 offset += type_size(struct_type->fields.structure[i].type);
2108 }
2109
2110 /* If the type is smaller than a vec4, replicate the last channel out. */
2111 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2112 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2113 else
2114 this->result.swizzle = BRW_SWIZZLE_NOOP;
2115 this->result.type = brw_type_for_base_type(ir->type);
2116
2117 this->result.reg_offset += offset;
2118 }
2119
2120 /**
2121 * We want to be careful in assignment setup to hit the actual storage
2122 * instead of potentially using a temporary like we might with the
2123 * ir_dereference handler.
2124 */
2125 static dst_reg
2126 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2127 {
2128 /* The LHS must be a dereference. If the LHS is a variable indexed array
2129     * access of a vector, it must be separated into a series of conditional moves
2130 * before reaching this point (see ir_vec_index_to_cond_assign).
2131 */
2132 assert(ir->as_dereference());
2133 ir_dereference_array *deref_array = ir->as_dereference_array();
2134 if (deref_array) {
2135 assert(!deref_array->array->type->is_vector());
2136 }
2137
2138 /* Use the rvalue deref handler for the most part. We'll ignore
2139 * swizzles in it and write swizzles using writemask, though.
2140 */
2141 ir->accept(v);
2142 return dst_reg(v->result);
2143 }
2144
2145 void
2146 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2147 const struct glsl_type *type,
2148 enum brw_predicate predicate)
2149 {
2150 if (type->base_type == GLSL_TYPE_STRUCT) {
2151 for (unsigned int i = 0; i < type->length; i++) {
2152 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2153 }
2154 return;
2155 }
2156
2157 if (type->is_array()) {
2158 for (unsigned int i = 0; i < type->length; i++) {
2159 emit_block_move(dst, src, type->fields.array, predicate);
2160 }
2161 return;
2162 }
2163
2164 if (type->is_matrix()) {
2165 const struct glsl_type *vec_type;
2166
2167 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2168 type->vector_elements, 1);
2169
2170 for (int i = 0; i < type->matrix_columns; i++) {
2171 emit_block_move(dst, src, vec_type, predicate);
2172 }
2173 return;
2174 }
2175
2176 assert(type->is_scalar() || type->is_vector());
2177
2178 dst->type = brw_type_for_base_type(type);
2179 src->type = dst->type;
2180
2181 dst->writemask = (1 << type->vector_elements) - 1;
2182
2183 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2184
2185 vec4_instruction *inst = emit(MOV(*dst, *src));
2186 inst->predicate = predicate;
2187
2188 dst->reg_offset++;
2189 src->reg_offset++;
2190 }
2191
2192
2193 /* If the RHS processing resulted in an instruction generating a
2194 * temporary value, and it would be easy to rewrite the instruction to
2195 * generate its result right into the LHS instead, do so. This ends
2196 * up reliably removing instructions where it can be tricky to do so
2197 * later without real UD chain information.
2198 */
2199 bool
2200 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2201 dst_reg dst,
2202 src_reg src,
2203 vec4_instruction *pre_rhs_inst,
2204 vec4_instruction *last_rhs_inst)
2205 {
2206 /* This could be supported, but it would take more smarts. */
2207 if (ir->condition)
2208 return false;
2209
2210 if (pre_rhs_inst == last_rhs_inst)
2211 return false; /* No instructions generated to work with. */
2212
2213 /* Make sure the last instruction generated our source reg. */
2214 if (src.file != GRF ||
2215 src.file != last_rhs_inst->dst.file ||
2216 src.reg != last_rhs_inst->dst.reg ||
2217 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2218 src.reladdr ||
2219 src.abs ||
2220 src.negate ||
2221 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2222 return false;
2223
2224    /* Check that the last instruction fully initialized the channels
2225 * we want to use, in the order we want to use them. We could
2226 * potentially reswizzle the operands of many instructions so that
2227 * we could handle out of order channels, but don't yet.
2228 */
2229
2230 for (unsigned i = 0; i < 4; i++) {
2231 if (dst.writemask & (1 << i)) {
2232 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2233 return false;
2234
2235 if (BRW_GET_SWZ(src.swizzle, i) != i)
2236 return false;
2237 }
2238 }
2239
2240 /* Success! Rewrite the instruction. */
2241 last_rhs_inst->dst.file = dst.file;
2242 last_rhs_inst->dst.reg = dst.reg;
2243 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2244 last_rhs_inst->dst.reladdr = dst.reladdr;
2245 last_rhs_inst->dst.writemask &= dst.writemask;
2246
2247 return true;
2248 }
2249
2250 void
2251 vec4_visitor::visit(ir_assignment *ir)
2252 {
2253 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2254 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2255
2256 if (!ir->lhs->type->is_scalar() &&
2257 !ir->lhs->type->is_vector()) {
2258 ir->rhs->accept(this);
2259 src_reg src = this->result;
2260
2261 if (ir->condition) {
2262 emit_bool_to_cond_code(ir->condition, &predicate);
2263 }
2264
2265 /* emit_block_move doesn't account for swizzles in the source register.
2266 * This should be ok, since the source register is a structure or an
2267 * array, and those can't be swizzled. But double-check to be sure.
2268 */
2269 assert(src.swizzle ==
2270 (ir->rhs->type->is_matrix()
2271 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2272 : BRW_SWIZZLE_NOOP));
2273
2274 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2275 return;
2276 }
2277
2278 /* Now we're down to just a scalar/vector with writemasks. */
2279 int i;
2280
2281 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2282 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2283
2284 ir->rhs->accept(this);
2285
2286 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2287
2288 int swizzles[4];
2289 int src_chan = 0;
2290
2291 assert(ir->lhs->type->is_vector() ||
2292 ir->lhs->type->is_scalar());
2293 dst.writemask = ir->write_mask;
2294
2295 /* Swizzle a small RHS vector into the channels being written.
2296 *
2297     * GLSL IR treats write_mask as dictating how many channels are
2298     * present on the RHS, while in our instructions we need to make
2299     * those channels appear in the slots of the vec4 they're written to.
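     *
     * For example, a two-component RHS written through .xz becomes the
     * swizzle <x, x, y, x>: RHS channel 0 feeds .x, channel 1 feeds .z, and
     * the unwritten slots simply repeat channel 0.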
2300 */
2301 for (int i = 0; i < 4; i++)
2302 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2303
2304 src_reg src = swizzle(this->result,
2305 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2306 swizzles[2], swizzles[3]));
2307
2308 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2309 return;
2310 }
2311
2312 if (ir->condition) {
2313 emit_bool_to_cond_code(ir->condition, &predicate);
2314 }
2315
2316 for (i = 0; i < type_size(ir->lhs->type); i++) {
2317 vec4_instruction *inst = emit(MOV(dst, src));
2318 inst->predicate = predicate;
2319
2320 dst.reg_offset++;
2321 src.reg_offset++;
2322 }
2323 }
2324
2325 void
2326 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2327 {
2328 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2329 foreach_in_list(ir_constant, field_value, &ir->components) {
2330 emit_constant_values(dst, field_value);
2331 }
2332 return;
2333 }
2334
2335 if (ir->type->is_array()) {
2336 for (unsigned int i = 0; i < ir->type->length; i++) {
2337 emit_constant_values(dst, ir->array_elements[i]);
2338 }
2339 return;
2340 }
2341
2342 if (ir->type->is_matrix()) {
2343 for (int i = 0; i < ir->type->matrix_columns; i++) {
2344 float *vec = &ir->value.f[i * ir->type->vector_elements];
2345
2346 for (int j = 0; j < ir->type->vector_elements; j++) {
2347 dst->writemask = 1 << j;
2348 dst->type = BRW_REGISTER_TYPE_F;
2349
2350 emit(MOV(*dst, src_reg(vec[j])));
2351 }
2352 dst->reg_offset++;
2353 }
2354 return;
2355 }
2356
2357 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2358
2359 for (int i = 0; i < ir->type->vector_elements; i++) {
2360 if (!(remaining_writemask & (1 << i)))
2361 continue;
2362
2363 dst->writemask = 1 << i;
2364 dst->type = brw_type_for_base_type(ir->type);
2365
2366 /* Find other components that match the one we're about to
2367 * write. Emits fewer instructions for things like vec4(0.5,
2368 * 1.5, 1.5, 1.5).
2369 */
2370 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2371 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2372 if (ir->value.b[i] == ir->value.b[j])
2373 dst->writemask |= (1 << j);
2374 } else {
2375 /* u, i, and f storage all line up, so no need for a
2376 * switch case for comparing each type.
2377 */
2378 if (ir->value.u[i] == ir->value.u[j])
2379 dst->writemask |= (1 << j);
2380 }
2381 }
2382
2383 switch (ir->type->base_type) {
2384 case GLSL_TYPE_FLOAT:
2385 emit(MOV(*dst, src_reg(ir->value.f[i])));
2386 break;
2387 case GLSL_TYPE_INT:
2388 emit(MOV(*dst, src_reg(ir->value.i[i])));
2389 break;
2390 case GLSL_TYPE_UINT:
2391 emit(MOV(*dst, src_reg(ir->value.u[i])));
2392 break;
2393 case GLSL_TYPE_BOOL:
2394 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2395 break;
2396 default:
2397 unreachable("Non-float/uint/int/bool constant");
2398 }
2399
2400 remaining_writemask &= ~dst->writemask;
2401 }
2402 dst->reg_offset++;
2403 }
2404
2405 void
2406 vec4_visitor::visit(ir_constant *ir)
2407 {
2408 dst_reg dst = dst_reg(this, ir->type);
2409 this->result = src_reg(dst);
2410
2411 emit_constant_values(&dst, ir);
2412 }
2413
2414 void
2415 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2416 {
2417 ir_dereference *deref = static_cast<ir_dereference *>(
2418 ir->actual_parameters.get_head());
2419 ir_variable *location = deref->variable_referenced();
2420 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2421 location->data.binding);
2422
2423 /* Calculate the surface offset */
2424 src_reg offset(this, glsl_type::uint_type);
2425 ir_dereference_array *deref_array = deref->as_dereference_array();
2426 if (deref_array) {
2427 deref_array->array_index->accept(this);
2428
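      /* Scale the array index by the size of one counter and add the
       * counter's byte offset within the buffer.
       */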
2429 src_reg tmp(this, glsl_type::uint_type);
2430 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2431 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2432 } else {
2433 offset = location->data.atomic.offset;
2434 }
2435
2436 /* Emit the appropriate machine instruction */
2437 const char *callee = ir->callee->function_name();
2438 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2439
2440 if (!strcmp("__intrinsic_atomic_read", callee)) {
2441 emit_untyped_surface_read(surf_index, dst, offset);
2442
2443 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2444 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2445 src_reg(), src_reg());
2446
2447 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2448 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2449 src_reg(), src_reg());
2450 }
2451
2452 brw_mark_surface_used(stage_prog_data, surf_index);
2453 }
2454
2455 void
2456 vec4_visitor::visit(ir_call *ir)
2457 {
2458 const char *callee = ir->callee->function_name();
2459
2460 if (!strcmp("__intrinsic_atomic_read", callee) ||
2461 !strcmp("__intrinsic_atomic_increment", callee) ||
2462 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2463 visit_atomic_counter_intrinsic(ir);
2464 } else {
2465 unreachable("Unsupported intrinsic.");
2466 }
2467 }
2468
2469 src_reg
2470 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2471 {
2472 vec4_instruction *inst =
2473 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2474 dst_reg(this, glsl_type::uvec4_type));
2475 inst->base_mrf = 2;
2476 inst->src[1] = sampler;
2477
2478 int param_base;
2479
2480 if (devinfo->gen >= 9) {
2481 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2482 vec4_instruction *header_inst = new(mem_ctx)
2483 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2484 dst_reg(MRF, inst->base_mrf));
2485
2486 emit(header_inst);
2487
2488 inst->mlen = 2;
2489 inst->header_size = 1;
2490 param_base = inst->base_mrf + 1;
2491 } else {
2492 inst->mlen = 1;
2493 param_base = inst->base_mrf;
2494 }
2495
2496    /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2497 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2498 int zero_mask = 0xf & ~coord_mask;
2499
2500 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2501 coordinate));
2502
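   /* Zero the remaining payload channels, including the always-zero lod. */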
2503 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2504 src_reg(0)));
2505
2506 emit(inst);
2507 return src_reg(inst->dst);
2508 }
2509
2510 static bool
2511 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2512 {
2513 if (devinfo->gen < 8 && !devinfo->is_haswell)
2514 return false;
2515
2516 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2517 }
2518
2519 void
2520 vec4_visitor::visit(ir_texture *ir)
2521 {
2522 uint32_t sampler =
2523 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2524
2525 ir_rvalue *nonconst_sampler_index =
2526 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2527
2528 /* Handle non-constant sampler array indexing */
2529 src_reg sampler_reg;
2530 if (nonconst_sampler_index) {
2531 /* The highest sampler which may be used by this operation is
2532 * the last element of the array. Mark it here, because the generator
2533 * doesn't have enough information to determine the bound.
2534 */
2535 uint32_t array_size = ir->sampler->as_dereference_array()
2536 ->array->type->array_size();
2537
2538 uint32_t max_used = sampler + array_size - 1;
2539 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2540 max_used += prog_data->base.binding_table.gather_texture_start;
2541 } else {
2542 max_used += prog_data->base.binding_table.texture_start;
2543 }
2544
2545 brw_mark_surface_used(&prog_data->base, max_used);
2546
2547 /* Emit code to evaluate the actual indexing expression */
2548 nonconst_sampler_index->accept(this);
2549 src_reg temp(this, glsl_type::uint_type);
2550 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2551 sampler_reg = emit_uniformize(temp);
2552 } else {
2553 /* Single sampler, or constant array index; the indexing expression
2554 * is just an immediate.
2555 */
2556 sampler_reg = src_reg(sampler);
2557 }
2558
2559 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2560 * emitting anything other than setting up the constant result.
2561 */
2562 if (ir->op == ir_tg4) {
2563 ir_constant *chan = ir->lod_info.component->as_constant();
2564 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2565 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2566 dst_reg result(this, ir->type);
2567 this->result = src_reg(result);
2568 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2569 return;
2570 }
2571 }
2572
2573 /* Should be lowered by do_lower_texture_projection */
2574 assert(!ir->projector);
2575
2576 /* Should be lowered */
2577 assert(!ir->offset || !ir->offset->type->is_array());
2578
2579 /* Generate code to compute all the subexpression trees. This has to be
2580 * done before loading any values into MRFs for the sampler message since
2581 * generating these values may involve SEND messages that need the MRFs.
2582 */
2583 src_reg coordinate;
2584 if (ir->coordinate) {
2585 ir->coordinate->accept(this);
2586 coordinate = this->result;
2587 }
2588
2589 src_reg shadow_comparitor;
2590 if (ir->shadow_comparitor) {
2591 ir->shadow_comparitor->accept(this);
2592 shadow_comparitor = this->result;
2593 }
2594
2595 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2596 src_reg offset_value;
2597 if (has_nonconstant_offset) {
2598 ir->offset->accept(this);
2599 offset_value = src_reg(this->result);
2600 }
2601
2602 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2603 src_reg lod, dPdx, dPdy, sample_index, mcs;
2604 switch (ir->op) {
2605 case ir_tex:
2606 lod = src_reg(0.0f);
2607 lod_type = glsl_type::float_type;
2608 break;
2609 case ir_txf:
2610 case ir_txl:
2611 case ir_txs:
2612 ir->lod_info.lod->accept(this);
2613 lod = this->result;
2614 lod_type = ir->lod_info.lod->type;
2615 break;
2616 case ir_query_levels:
2617 lod = src_reg(0);
2618 lod_type = glsl_type::int_type;
2619 break;
2620 case ir_txf_ms:
2621 ir->lod_info.sample_index->accept(this);
2622 sample_index = this->result;
2623 sample_index_type = ir->lod_info.sample_index->type;
2624
2625 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2626 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2627 else
2628 mcs = src_reg(0u);
2629 break;
2630 case ir_txd:
2631 ir->lod_info.grad.dPdx->accept(this);
2632 dPdx = this->result;
2633
2634 ir->lod_info.grad.dPdy->accept(this);
2635 dPdy = this->result;
2636
2637 lod_type = ir->lod_info.grad.dPdx->type;
2638 break;
2639 case ir_txb:
2640 case ir_lod:
2641 case ir_tg4:
2642 break;
2643 }
2644
2645 enum opcode opcode;
2646 switch (ir->op) {
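   /* Plain ir_tex has no derivatives available in the vertex shader, so it
    * is sent as TXL using the explicit zero LOD set up above.
    */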
2647 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2648 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2649 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2650 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2651 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2652 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2653 case ir_tg4: opcode = has_nonconstant_offset
2654 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2655 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2656 case ir_txb:
2657 unreachable("TXB is not valid for vertex shaders.");
2658 case ir_lod:
2659 unreachable("LOD is not valid for vertex shaders.");
2660 default:
2661 unreachable("Unrecognized tex op");
2662 }
2663
2664 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2665 opcode, dst_reg(this, ir->type));
2666
2667 if (ir->offset != NULL && !has_nonconstant_offset) {
2668 inst->offset =
2669 brw_texture_offset(ir->offset->as_constant()->value.i,
2670 ir->offset->type->vector_elements);
2671 }
2672
2673 /* Stuff the channel select bits in the top of the texture offset */
2674 if (ir->op == ir_tg4)
2675 inst->offset |= gather_channel(ir, sampler) << 16;
2676
2677 /* The message header is necessary for:
2678 * - Gen4 (always)
2679 * - Gen9+ for selecting SIMD4x2
2680 * - Texel offsets
2681 * - Gather channel selection
2682 * - Sampler indices too large to fit in a 4-bit value.
2683 */
2684 inst->header_size =
2685 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2686 inst->offset != 0 || ir->op == ir_tg4 ||
2687 is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2688 inst->base_mrf = 2;
2689 inst->mlen = inst->header_size + 1; /* always at least one */
2690 inst->dst.writemask = WRITEMASK_XYZW;
2691 inst->shadow_compare = ir->shadow_comparitor != NULL;
2692
2693 inst->src[1] = sampler_reg;
2694
2695 /* MRF for the first parameter */
2696 int param_base = inst->base_mrf + inst->header_size;
2697
2698 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2699 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2700 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2701 } else {
2702 /* Load the coordinate */
2703 /* FINISHME: gl_clamp_mask and saturate */
2704 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2705 int zero_mask = 0xf & ~coord_mask;
2706
2707 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2708 coordinate));
2709
2710 if (zero_mask != 0) {
2711 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2712 src_reg(0)));
2713 }
2714 /* Load the shadow comparitor */
2715 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2716 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2717 WRITEMASK_X),
2718 shadow_comparitor));
2719 inst->mlen++;
2720 }
2721
2722 /* Load the LOD info */
2723 if (ir->op == ir_tex || ir->op == ir_txl) {
2724 int mrf, writemask;
2725 if (devinfo->gen >= 5) {
2726 mrf = param_base + 1;
2727 if (ir->shadow_comparitor) {
2728 writemask = WRITEMASK_Y;
2729 /* mlen already incremented */
2730 } else {
2731 writemask = WRITEMASK_X;
2732 inst->mlen++;
2733 }
2734 } else /* devinfo->gen == 4 */ {
2735 mrf = param_base;
2736 writemask = WRITEMASK_W;
2737 }
2738 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2739 } else if (ir->op == ir_txf) {
2740 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2741 } else if (ir->op == ir_txf_ms) {
2742 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2743 sample_index));
2744 if (devinfo->gen >= 7) {
2745 /* MCS data is in the first channel of `mcs`, but we need to get it into
2746 * the .y channel of the second vec4 of params, so replicate .x across
2747 * the whole vec4 and then mask off everything except .y
2748 */
2749 mcs.swizzle = BRW_SWIZZLE_XXXX;
2750 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2751 mcs));
2752 }
2753 inst->mlen++;
2754 } else if (ir->op == ir_txd) {
2755 const glsl_type *type = lod_type;
2756
2757 if (devinfo->gen >= 5) {
2758 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2759 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2760 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2761 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2762 inst->mlen++;
2763
2764 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2765 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2766 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2767 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2768 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2769 inst->mlen++;
2770
2771 if (ir->shadow_comparitor) {
2772 emit(MOV(dst_reg(MRF, param_base + 2,
2773 ir->shadow_comparitor->type, WRITEMASK_Z),
2774 shadow_comparitor));
2775 }
2776 }
2777 } else /* devinfo->gen == 4 */ {
2778 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2779 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2780 inst->mlen += 2;
2781 }
2782 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2783 if (ir->shadow_comparitor) {
2784 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2785 shadow_comparitor));
2786 }
2787
2788 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2789 offset_value));
2790 inst->mlen++;
2791 }
2792 }
2793
2794 emit(inst);
2795
2796    /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2797     * faces * layers, but the spec requires just layers.
2798 */
2799 if (ir->op == ir_txs) {
2800 glsl_type const *type = ir->sampler->type;
2801 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2802 type->sampler_array) {
2803 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2804 writemask(inst->dst, WRITEMASK_Z),
2805 src_reg(inst->dst), src_reg(6));
2806 }
2807 }
2808
2809 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2810 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2811 }
2812
2813 swizzle_result(ir, src_reg(inst->dst), sampler);
2814 }
2815
2816 /**
2817 * Apply workarounds for Gen6 gather with UINT/SINT
2818 */
2819 void
2820 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2821 {
2822 if (!wa)
2823 return;
2824
2825 int width = (wa & WA_8BIT) ? 8 : 16;
2826 dst_reg dst_f = dst;
2827 dst_f.type = BRW_REGISTER_TYPE_F;
2828
2829 /* Convert from UNORM to UINT */
2830 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2831 emit(MOV(dst, src_reg(dst_f)));
2832
2833 if (wa & WA_SIGN) {
2834 /* Reinterpret the UINT value as a signed INT value by
2835 * shifting the sign bit into place, then shifting back
2836 * preserving sign.
2837 */
2838 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2839 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2840 }
2841 }
2842
2843 /**
2844 * Set up the gather channel based on the swizzle, for gather4.
2845 */
2846 uint32_t
2847 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2848 {
2849 ir_constant *chan = ir->lod_info.component->as_constant();
2850 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2851 switch (swiz) {
2852 case SWIZZLE_X: return 0;
2853 case SWIZZLE_Y:
2854 /* gather4 sampler is broken for green channel on RG32F --
2855 * we must ask for blue instead.
2856 */
2857 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2858 return 2;
2859 return 1;
2860 case SWIZZLE_Z: return 2;
2861 case SWIZZLE_W: return 3;
2862 default:
2863 unreachable("Not reached"); /* zero, one swizzles handled already */
2864 }
2865 }
2866
2867 void
2868 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2869 {
2870 int s = key->tex.swizzles[sampler];
2871
2872 this->result = src_reg(this, ir->type);
2873 dst_reg swizzled_result(this->result);
2874
2875 if (ir->op == ir_query_levels) {
2876 /* # levels is in .w */
2877 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2878 emit(MOV(swizzled_result, orig_val));
2879 return;
2880 }
2881
2882 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2883 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2884 emit(MOV(swizzled_result, orig_val));
2885 return;
2886 }
2887
2888
2889 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2890 int swizzle[4] = {0};
2891
2892 for (int i = 0; i < 4; i++) {
2893 switch (GET_SWZ(s, i)) {
2894 case SWIZZLE_ZERO:
2895 zero_mask |= (1 << i);
2896 break;
2897 case SWIZZLE_ONE:
2898 one_mask |= (1 << i);
2899 break;
2900 default:
2901 copy_mask |= (1 << i);
2902 swizzle[i] = GET_SWZ(s, i);
2903 break;
2904 }
2905 }
2906
2907 if (copy_mask) {
2908 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2909 swizzled_result.writemask = copy_mask;
2910 emit(MOV(swizzled_result, orig_val));
2911 }
2912
2913 if (zero_mask) {
2914 swizzled_result.writemask = zero_mask;
2915 emit(MOV(swizzled_result, src_reg(0.0f)));
2916 }
2917
2918 if (one_mask) {
2919 swizzled_result.writemask = one_mask;
2920 emit(MOV(swizzled_result, src_reg(1.0f)));
2921 }
2922 }
2923
2924 void
2925 vec4_visitor::visit(ir_return *)
2926 {
2927 unreachable("not reached");
2928 }
2929
2930 void
2931 vec4_visitor::visit(ir_discard *)
2932 {
2933 unreachable("not reached");
2934 }
2935
2936 void
2937 vec4_visitor::visit(ir_if *ir)
2938 {
2939 /* Don't point the annotation at the if statement, because then it plus
2940 * the then and else blocks get printed.
2941 */
2942 this->base_ir = ir->condition;
2943
2944 if (devinfo->gen == 6) {
2945 emit_if_gen6(ir);
2946 } else {
2947 enum brw_predicate predicate;
2948 emit_bool_to_cond_code(ir->condition, &predicate);
2949 emit(IF(predicate));
2950 }
2951
2952 visit_instructions(&ir->then_instructions);
2953
2954 if (!ir->else_instructions.is_empty()) {
2955 this->base_ir = ir->condition;
2956 emit(BRW_OPCODE_ELSE);
2957
2958 visit_instructions(&ir->else_instructions);
2959 }
2960
2961 this->base_ir = ir->condition;
2962 emit(BRW_OPCODE_ENDIF);
2963 }
2964
2965 void
2966 vec4_visitor::visit(ir_emit_vertex *)
2967 {
2968 unreachable("not reached");
2969 }
2970
2971 void
2972 vec4_visitor::visit(ir_end_primitive *)
2973 {
2974 unreachable("not reached");
2975 }
2976
2977 void
2978 vec4_visitor::visit(ir_barrier *)
2979 {
2980 unreachable("not reached");
2981 }
2982
2983 void
2984 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2985 dst_reg dst, src_reg offset,
2986 src_reg src0, src_reg src1)
2987 {
2988 unsigned mlen = 0;
2989
2990 /* Set the atomic operation offset. */
2991 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2992 mlen++;
2993
2994 /* Set the atomic operation arguments. */
2995 if (src0.file != BAD_FILE) {
2996 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2997 mlen++;
2998 }
2999
3000 if (src1.file != BAD_FILE) {
3001 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3002 mlen++;
3003 }
3004
3005 /* Emit the instruction. Note that this maps to the normal SIMD8
3006 * untyped atomic message on Ivy Bridge, but that's OK because
3007 * unused channels will be masked out.
3008 */
3009 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3010 brw_message_reg(0),
3011 src_reg(surf_index), src_reg(atomic_op));
3012 inst->mlen = mlen;
3013 }
3014
3015 void
3016 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3017 src_reg offset)
3018 {
3019 /* Set the surface read offset. */
3020 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3021
3022 /* Emit the instruction. Note that this maps to the normal SIMD8
3023 * untyped surface read message, but that's OK because unused
3024 * channels will be masked out.
3025 */
3026 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3027 brw_message_reg(0),
3028 src_reg(surf_index), src_reg(1));
3029 inst->mlen = 1;
3030 }
3031
3032 void
3033 vec4_visitor::emit_ndc_computation()
3034 {
3035 /* Get the position */
3036 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3037
3038 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3039 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3040 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3041
3042 current_annotation = "NDC";
3043 dst_reg ndc_w = ndc;
3044 ndc_w.writemask = WRITEMASK_W;
3045 src_reg pos_w = pos;
3046 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3047 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3048
3049 dst_reg ndc_xyz = ndc;
3050 ndc_xyz.writemask = WRITEMASK_XYZ;
3051
3052 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3053 }
3054
3055 void
3056 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3057 {
3058 if (devinfo->gen < 6 &&
3059 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3060 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3061 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3062 dst_reg header1_w = header1;
3063 header1_w.writemask = WRITEMASK_W;
3064
3065 emit(MOV(header1, 0u));
3066
3067 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3068 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3069
3070 current_annotation = "Point size";
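         /* Pack the point size as a fixed-point value into bits 8..18 of the
          * header word: scale by 2^11, then mask to the 11-bit field.
          */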
3071 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3072 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3073 }
3074
3075 if (key->userclip_active) {
3076 current_annotation = "Clipping flags";
3077 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3078 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3079
3080 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3081 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3082 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3083
3084 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3085 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3086 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3087 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3088 }
3089
3090 /* i965 clipping workaround:
3091 * 1) Test for -ve rhw
3092 * 2) If set,
3093 * set ndc = (0,0,0,0)
3094 * set ucp[6] = 1
3095 *
3096 * Later, clipping will detect ucp[6] and ensure the primitive is
3097 * clipped against all fixed planes.
3098 */
3099 if (devinfo->has_negative_rhw_bug) {
3100 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3101 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3102 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3103 vec4_instruction *inst;
3104 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3105 inst->predicate = BRW_PREDICATE_NORMAL;
3106 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3107 inst->predicate = BRW_PREDICATE_NORMAL;
3108 }
3109
3110 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3111 } else if (devinfo->gen < 6) {
3112 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3113 } else {
3114 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3115 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3116 dst_reg reg_w = reg;
3117 reg_w.writemask = WRITEMASK_W;
3118 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3119 }
3120 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3121 dst_reg reg_y = reg;
3122 reg_y.writemask = WRITEMASK_Y;
3123 reg_y.type = BRW_REGISTER_TYPE_D;
3124 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3125 }
3126 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3127 dst_reg reg_z = reg;
3128 reg_z.writemask = WRITEMASK_Z;
3129 reg_z.type = BRW_REGISTER_TYPE_D;
3130 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3131 }
3132 }
3133 }
3134
3135 void
3136 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3137 {
3138 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3139 *
3140 * "If a linked set of shaders forming the vertex stage contains no
3141 * static write to gl_ClipVertex or gl_ClipDistance, but the
3142 * application has requested clipping against user clip planes through
3143 * the API, then the coordinate written to gl_Position is used for
3144 * comparison against the user clip planes."
3145 *
3146 * This function is only called if the shader didn't write to
3147 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3148 * if the user wrote to it; otherwise we use gl_Position.
3149 */
3150 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3151 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3152 clip_vertex = VARYING_SLOT_POS;
3153 }
3154
3155 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3156 ++i) {
3157 reg.writemask = 1 << i;
3158 emit(DP4(reg,
3159 src_reg(output_reg[clip_vertex]),
3160 src_reg(this->userplane[i + offset])));
3161 }
3162 }
3163
3164 vec4_instruction *
3165 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3166 {
3167 assert (varying < VARYING_SLOT_MAX);
3168 reg.type = output_reg[varying].type;
3169 current_annotation = output_reg_annotation[varying];
3170 /* Copy the register, saturating if necessary */
3171 return emit(MOV(reg, src_reg(output_reg[varying])));
3172 }
3173
3174 void
3175 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3176 {
3177 reg.type = BRW_REGISTER_TYPE_F;
3178
3179 switch (varying) {
3180 case VARYING_SLOT_PSIZ:
3181 {
3182 /* PSIZ is always in slot 0, and is coupled with other flags. */
3183 current_annotation = "indices, point width, clip flags";
3184 emit_psiz_and_flags(reg);
3185 break;
3186 }
3187 case BRW_VARYING_SLOT_NDC:
3188 current_annotation = "NDC";
3189 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3190 break;
3191 case VARYING_SLOT_POS:
3192 current_annotation = "gl_Position";
3193 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3194 break;
3195 case VARYING_SLOT_EDGE:
3196 /* This is present when doing unfilled polygons. We're supposed to copy
3197 * the edge flag from the user-provided vertex array
3198        * (glEdgeFlagPointer); otherwise we copy from the current value
3199 * of that attribute (starts as 1.0f). This is then used in clipping to
3200 * determine which edges should be drawn as wireframe.
3201 */
3202 current_annotation = "edge flag";
3203 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3204 glsl_type::float_type, WRITEMASK_XYZW))));
3205 break;
3206 case BRW_VARYING_SLOT_PAD:
3207 /* No need to write to this slot */
3208 break;
3209 case VARYING_SLOT_COL0:
3210 case VARYING_SLOT_COL1:
3211 case VARYING_SLOT_BFC0:
3212 case VARYING_SLOT_BFC1: {
3213 /* These built-in varyings are only supported in compatibility mode,
3214 * and we only support GS in core profile. So, this must be a vertex
3215 * shader.
3216 */
3217 assert(stage == MESA_SHADER_VERTEX);
3218 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3219 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3220 inst->saturate = true;
3221 break;
3222 }
3223
3224 default:
3225 emit_generic_urb_slot(reg, varying);
3226 break;
3227 }
3228 }
3229
3230 static int
3231 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3232 {
3233 if (devinfo->gen >= 6) {
3234 /* URB data written (does not include the message header reg) must
3235 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3236 * section 5.4.3.2.2: URB_INTERLEAVED.
3237 *
3238 * URB entries are allocated on a multiple of 1024 bits, so an
3239 * extra 128 bits written here to make the end align to 256 is
3240 * no problem.
3241 */
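      /* mlen counts the message header register too, so an odd mlen means an
       * even number of data registers; bump even values up by one.
       */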
3242 if ((mlen % 2) != 1)
3243 mlen++;
3244 }
3245
3246 return mlen;
3247 }
3248
3249
3250 /**
3251 * Generates the VUE payload plus the necessary URB write instructions to
3252 * output it.
3253 *
3254 * The VUE layout is documented in Volume 2a.
3255 */
3256 void
3257 vec4_visitor::emit_vertex()
3258 {
3259 /* MRF 0 is reserved for the debugger, so start with message header
3260 * in MRF 1.
3261 */
3262 int base_mrf = 1;
3263 int mrf = base_mrf;
3264 /* In the process of generating our URB write message contents, we
3265 * may need to unspill a register or load from an array. Those
3266 * reads would use MRFs 14-15.
3267 */
3268 int max_usable_mrf = 13;
3269
3270 /* The following assertion verifies that max_usable_mrf causes an
3271 * even-numbered amount of URB write data, which will meet gen6's
3272 * requirements for length alignment.
3273 */
3274 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3275
3276 /* First mrf is the g0-based message header containing URB handles and
3277 * such.
3278 */
3279 emit_urb_write_header(mrf++);
3280
3281 if (devinfo->gen < 6) {
3282 emit_ndc_computation();
3283 }
3284
3285    /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3286 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3287 current_annotation = "user clip distances";
3288
3289 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3290 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3291
3292 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3293 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3294 }
3295
3296 /* We may need to split this up into several URB writes, so do them in a
3297 * loop.
3298 */
3299 int slot = 0;
3300 bool complete = false;
3301 do {
3302 /* URB offset is in URB row increments, and each of our MRFs is half of
3303 * one of those, since we're doing interleaved writes.
3304 */
3305 int offset = slot / 2;
3306
3307 mrf = base_mrf + 1;
3308 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3309 emit_urb_slot(dst_reg(MRF, mrf++),
3310 prog_data->vue_map.slot_to_varying[slot]);
3311
3312 /* If this was max_usable_mrf, we can't fit anything more into this
3313 * URB WRITE.
3314 */
3315 if (mrf > max_usable_mrf) {
3316 slot++;
3317 break;
3318 }
3319 }
3320
3321 complete = slot >= prog_data->vue_map.num_slots;
3322 current_annotation = "URB write";
3323 vec4_instruction *inst = emit_urb_write_opcode(complete);
3324 inst->base_mrf = base_mrf;
3325 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3326 inst->offset += offset;
3327    } while (!complete);
3328 }
3329
3330
3331 src_reg
3332 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3333 src_reg *reladdr, int reg_offset)
3334 {
3335 /* Because we store the values to scratch interleaved like our
3336 * vertex data, we need to scale the vec4 index by 2.
3337 */
3338 int message_header_scale = 2;
3339
3340 /* Pre-gen6, the message header uses byte offsets instead of vec4
3341 * (16-byte) offset units.
3342 */
3343 if (devinfo->gen < 6)
3344 message_header_scale *= 16;
3345
3346 if (reladdr) {
3347 src_reg index = src_reg(this, glsl_type::int_type);
3348
3349 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3350 src_reg(reg_offset)));
3351 emit_before(block, inst, MUL(dst_reg(index), index,
3352 src_reg(message_header_scale)));
3353
3354 return index;
3355 } else {
3356 return src_reg(reg_offset * message_header_scale);
3357 }
3358 }
3359
3360 src_reg
3361 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3362 src_reg *reladdr, int reg_offset)
3363 {
3364 if (reladdr) {
3365 src_reg index = src_reg(this, glsl_type::int_type);
3366
3367 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3368 src_reg(reg_offset)));
3369
3370 /* Pre-gen6, the message header uses byte offsets instead of vec4
3371 * (16-byte) offset units.
3372 */
3373 if (devinfo->gen < 6) {
3374 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3375 }
3376
3377 return index;
3378 } else if (devinfo->gen >= 8) {
3379 /* Store the offset in a GRF so we can send-from-GRF. */
3380 src_reg offset = src_reg(this, glsl_type::int_type);
3381 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3382 return offset;
3383 } else {
3384 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3385 return src_reg(reg_offset * message_header_scale);
3386 }
3387 }
3388
3389 /**
3390 * Emits an instruction before @inst to load the value named by @orig_src
3391 * from scratch space at @base_offset to @temp.
3392 *
3393 * @base_offset is measured in 32-byte units (the size of a register).
3394 */
3395 void
3396 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3397 dst_reg temp, src_reg orig_src,
3398 int base_offset)
3399 {
3400 int reg_offset = base_offset + orig_src.reg_offset;
3401 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3402 reg_offset);
3403
3404 emit_before(block, inst, SCRATCH_READ(temp, index));
3405 }
3406
3407 /**
3408 * Emits an instruction after @inst to store the value to be written
3409 * to @orig_dst to scratch space at @base_offset, from @temp.
3410 *
3411 * @base_offset is measured in 32-byte units (the size of a register).
3412 */
3413 void
3414 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3415 int base_offset)
3416 {
3417 int reg_offset = base_offset + inst->dst.reg_offset;
3418 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3419 reg_offset);
3420
3421 /* Create a temporary register to store *inst's result in.
3422 *
3423 * We have to be careful in MOVing from our temporary result register in
3424 * the scratch write. If we swizzle from channels of the temporary that
3425 * weren't initialized, it will confuse live interval analysis, which will
3426 * make spilling fail to make progress.
3427 */
3428 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3429 inst->dst.type),
3430 brw_swizzle_for_mask(inst->dst.writemask));
3431 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3432 inst->dst.writemask));
3433 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3434 write->predicate = inst->predicate;
3435 write->ir = inst->ir;
3436 write->annotation = inst->annotation;
3437 inst->insert_after(block, write);
3438
3439 inst->dst.file = temp.file;
3440 inst->dst.reg = temp.reg;
3441 inst->dst.reg_offset = temp.reg_offset;
3442 inst->dst.reladdr = NULL;
3443 }
3444
3445 /**
3446 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3447 * adds the scratch read(s) before \p inst. The function also checks for
3448 * recursive reladdr scratch accesses, issuing the corresponding scratch
3449 * loads and rewriting reladdr references accordingly.
3450 *
3451 * \return \p src if it did not require a scratch load, otherwise, the
3452 * register holding the result of the scratch load that the caller should
3453 * use to rewrite src.
3454 */
3455 src_reg
3456 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3457 vec4_instruction *inst, src_reg src)
3458 {
3459 /* Resolve recursive reladdr scratch access by calling ourselves
3460 * with src.reladdr
3461 */
3462 if (src.reladdr)
3463 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3464 *src.reladdr);
3465
3466 /* Now handle scratch access on src */
3467 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3468 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3469 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3470 src.reg = temp.reg;
3471 src.reg_offset = temp.reg_offset;
3472 src.reladdr = NULL;
3473 }
3474
3475 return src;
3476 }
3477
3478 /**
3479 * We can't generally support array access in GRF space, because a
3480 * single instruction's destination can only span 2 contiguous
3481 * registers. So, we send all GRF arrays that get variable index
3482 * access to scratch space.
3483 */
3484 void
3485 vec4_visitor::move_grf_array_access_to_scratch()
3486 {
3487 int scratch_loc[this->alloc.count];
3488 memset(scratch_loc, -1, sizeof(scratch_loc));
3489
3490 /* First, calculate the set of virtual GRFs that need to be punted
3491 * to scratch due to having any array access on them, and where in
3492 * scratch.
3493 */
3494 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3495 if (inst->dst.file == GRF && inst->dst.reladdr) {
3496 if (scratch_loc[inst->dst.reg] == -1) {
3497 scratch_loc[inst->dst.reg] = last_scratch;
3498 last_scratch += this->alloc.sizes[inst->dst.reg];
3499 }
3500
3501 for (src_reg *iter = inst->dst.reladdr;
3502 iter->reladdr;
3503 iter = iter->reladdr) {
3504 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3505 scratch_loc[iter->reg] = last_scratch;
3506 last_scratch += this->alloc.sizes[iter->reg];
3507 }
3508 }
3509 }
3510
3511 for (int i = 0 ; i < 3; i++) {
3512 for (src_reg *iter = &inst->src[i];
3513 iter->reladdr;
3514 iter = iter->reladdr) {
3515 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3516 scratch_loc[iter->reg] = last_scratch;
3517 last_scratch += this->alloc.sizes[iter->reg];
3518 }
3519 }
3520 }
3521 }
3522
3523 /* Now, for anything that will be accessed through scratch, rewrite
3524 * it to load/store. Note that this is a _safe list walk, because
3525 * we may generate a new scratch_write instruction after the one
3526 * we're processing.
3527 */
3528 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3529 /* Set up the annotation tracking for new generated instructions. */
3530 base_ir = inst->ir;
3531 current_annotation = inst->annotation;
3532
3533 /* First handle scratch access on the dst. Notice we have to handle
3534 * the case where the dst's reladdr also points to scratch space.
3535 */
3536 if (inst->dst.reladdr)
3537 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3538 *inst->dst.reladdr);
3539
3540 /* Now that we have handled any (possibly recursive) reladdr scratch
3541        * accesses for dst, we can safely do the scratch write for dst itself.
3542 */
3543 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3544 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3545
3546 /* Now handle scratch access on any src. In this case, since inst->src[i]
3547 * already is a src_reg, we can just call emit_resolve_reladdr with
3548 * inst->src[i] and it will take care of handling scratch loads for
3549 * both src and src.reladdr (recursively).
3550 */
3551 for (int i = 0 ; i < 3; i++) {
3552 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3553 inst->src[i]);
3554 }
3555 }
3556 }
3557
3558 /**
3559 * Emits an instruction before @inst to load the value named by @orig_src
3560 * from the pull constant buffer (surface) at @base_offset to @temp.
3561 */
3562 void
3563 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3564 dst_reg temp, src_reg orig_src,
3565 int base_offset)
3566 {
3567 int reg_offset = base_offset + orig_src.reg_offset;
3568 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3569 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3570 reg_offset);
3571
3572 emit_pull_constant_load_reg(temp,
3573 index,
3574 offset,
3575 block, inst);
3576 }
3577
3578 /**
3579 * Implements array access of uniforms by inserting a
3580 * PULL_CONSTANT_LOAD instruction.
3581 *
3582 * Unlike temporary GRF array access (where we don't support it due to
3583 * the difficulty of doing relative addressing on instruction
3584 * destinations), we could potentially do array access of uniforms
3585 * that were loaded in GRF space as push constants. In real-world
3586 * usage we've seen, though, the arrays being used are always larger
3587 * than we could load as push constants, so just always move all
3588 * uniform array access out to a pull constant buffer.
3589 */
3590 void
3591 vec4_visitor::move_uniform_array_access_to_pull_constants()
3592 {
3593 int pull_constant_loc[this->uniforms];
3594 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3595 bool nested_reladdr;
3596
3597 /* Walk through and find array access of uniforms. Put a copy of that
3598 * uniform in the pull constant buffer.
3599 *
3600 * Note that we don't move constant-indexed accesses to arrays. No
3601 * testing has been done of the performance impact of this choice.
3602 */
3603 do {
3604 nested_reladdr = false;
3605
3606 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3607 for (int i = 0 ; i < 3; i++) {
3608 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3609 continue;
3610
3611 int uniform = inst->src[i].reg;
3612
3613 if (inst->src[i].reladdr->reladdr)
3614 nested_reladdr = true; /* will need another pass */
3615
3616 /* If this array isn't already present in the pull constant buffer,
3617 * add it.
3618 */
3619 if (pull_constant_loc[uniform] == -1) {
3620 const gl_constant_value **values =
3621 &stage_prog_data->param[uniform * 4];
3622
3623 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3624
3625 assert(uniform < uniform_array_size);
3626 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3627 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3628 = values[j];
3629 }
3630 }
3631
3632 /* Set up the annotation tracking for new generated instructions. */
3633 base_ir = inst->ir;
3634 current_annotation = inst->annotation;
3635
3636 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3637
3638 emit_pull_constant_load(block, inst, temp, inst->src[i],
3639 pull_constant_loc[uniform]);
3640
3641 inst->src[i].file = temp.file;
3642 inst->src[i].reg = temp.reg;
3643 inst->src[i].reg_offset = temp.reg_offset;
3644 inst->src[i].reladdr = NULL;
3645 }
3646 }
3647 } while (nested_reladdr);
3648
3649 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3650 * no need to track them as larger-than-vec4 objects. This will be
3651 * relied on in cutting out unused uniform vectors from push
3652 * constants.
3653 */
3654 split_uniform_registers();
3655 }
3656
3657 void
3658 vec4_visitor::resolve_ud_negate(src_reg *reg)
3659 {
3660 if (reg->type != BRW_REGISTER_TYPE_UD ||
3661 !reg->negate)
3662 return;
3663
3664 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3665 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3666 *reg = temp;
3667 }
3668
3669 /**
3670 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3671 *
3672 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3673 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3674 */
3675 void
3676 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3677 {
3678 assert(devinfo->gen <= 5);
3679
3680 if (!rvalue->type->is_boolean())
3681 return;
3682
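   /* AND with 1 keeps only the defined low bit; negating that 0/1 integer
    * value then yields the 0/~0 encoding we use for booleans.
    */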
3683 src_reg and_result = src_reg(this, rvalue->type);
3684 src_reg neg_result = src_reg(this, rvalue->type);
3685 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3686 emit(MOV(dst_reg(neg_result), negate(and_result)));
3687 *reg = neg_result;
3688 }
3689
3690 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3691 void *log_data,
3692 struct gl_program *prog,
3693 const struct brw_vue_prog_key *key,
3694 struct brw_vue_prog_data *prog_data,
3695 struct gl_shader_program *shader_prog,
3696 gl_shader_stage stage,
3697 void *mem_ctx,
3698 bool no_spills,
3699 int shader_time_index)
3700 : backend_shader(compiler, log_data, mem_ctx,
3701 shader_prog, prog, &prog_data->base, stage),
3702 key(key),
3703 prog_data(prog_data),
3704 sanity_param_count(0),
3705 fail_msg(NULL),
3706 first_non_payload_grf(0),
3707 need_all_constants_in_pull_buffer(false),
3708 no_spills(no_spills),
3709 shader_time_index(shader_time_index),
3710 last_scratch(0)
3711 {
3712 this->failed = false;
3713
3714 this->base_ir = NULL;
3715 this->current_annotation = NULL;
3716 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3717
3718 this->variable_ht = hash_table_ctor(0,
3719 hash_table_pointer_hash,
3720 hash_table_pointer_compare);
3721
3722 this->virtual_grf_start = NULL;
3723 this->virtual_grf_end = NULL;
3724 this->live_intervals = NULL;
3725
3726 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3727
3728 this->uniforms = 0;
3729
3730 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3731 * at least one. See setup_uniforms() in brw_vec4.cpp.
3732 */
3733 this->uniform_array_size = 1;
3734 if (prog_data) {
3735 this->uniform_array_size =
3736 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3737 }
3738
3739 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3740 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3741 }
3742
3743 vec4_visitor::~vec4_visitor()
3744 {
3745 hash_table_dtor(this->variable_ht);
3746 }
3747
3748
3749 void
3750 vec4_visitor::fail(const char *format, ...)
3751 {
3752 va_list va;
3753 char *msg;
3754
3755 if (failed)
3756 return;
3757
3758 failed = true;
3759
3760 va_start(va, format);
3761 msg = ralloc_vasprintf(mem_ctx, format, va);
3762 va_end(va);
3763 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3764
3765 this->fail_msg = msg;
3766
3767 if (debug_enabled) {
3768 fprintf(stderr, "%s", msg);
3769 }
3770 }
3771
3772 } /* namespace brw */