i965/nir/vec4: Prepare source and destination registers for ALU operations
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of each destination channel to the result of the
220 * comparison, leaves the upper bits undefined, and updates the flag
221 * register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
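/* elements must be 2, 3 or 4: dot_opcodes[elements - 2] selects DP2, DP3 or DP4. */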
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
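/* Pre-Gen6, MATH is a send to the shared math unit, so the operands live in MRFs starting at base_mrf; mlen is 1 or 2 depending on whether a second source is present. */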
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (devinfo->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (devinfo->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
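/* The VF-encoded bytes 0x00, 0x60, 0x70 and 0x78 decode to 0.0, 8.0, 16.0 and 24.0; the type-converting MOV below turns them into the UD shift counts <0, 8, 16, 24>. */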
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
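/* unpackSnorm4x8 is specified as clamp(b / 127.0, -1.0, +1.0), so clamp the scaled result to [-1, 1]. */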
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
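/* packUnorm4x8: round(clamp(f, 0.0, 1.0) * 255.0) per component, packed one byte per channel. */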
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
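/* packSnorm4x8: round(clamp(f, -1.0, +1.0) * 127.0) per component, packed one byte per channel. */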
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575 /**
576 * Returns the minimum number of vec4 elements needed to pack a type.
577 *
578 * For simple types, it will return 1 (a single vec4); for matrices, the
579 * number of columns; for arrays and structs, the sum of the vec4 sizes of
580 * their elements; and for samplers and atomics, zero.
581 *
582 * This method is useful to calculate how much register space is needed to
583 * store a particular type.
584 */
585 int
586 vec4_visitor::type_size(const struct glsl_type *type)
587 {
588 unsigned int i;
589 int size;
590
591 switch (type->base_type) {
592 case GLSL_TYPE_UINT:
593 case GLSL_TYPE_INT:
594 case GLSL_TYPE_FLOAT:
595 case GLSL_TYPE_BOOL:
596 if (type->is_matrix()) {
597 return type->matrix_columns;
598 } else {
599 /* Regardless of size of vector, it gets a vec4. This is bad
600 * packing for things like floats, but otherwise arrays become a
601 * mess. Hopefully a later pass over the code can pack scalars
602 * down if appropriate.
603 */
604 return 1;
605 }
606 case GLSL_TYPE_ARRAY:
607 assert(type->length > 0);
608 return type_size(type->fields.array) * type->length;
609 case GLSL_TYPE_STRUCT:
610 size = 0;
611 for (i = 0; i < type->length; i++) {
612 size += type_size(type->fields.structure[i].type);
613 }
614 return size;
615 case GLSL_TYPE_SUBROUTINE:
616 return 1;
617
618 case GLSL_TYPE_SAMPLER:
619 /* Samplers take up no register space, since they're baked in at
620 * link time.
621 */
622 return 0;
623 case GLSL_TYPE_ATOMIC_UINT:
624 return 0;
625 case GLSL_TYPE_IMAGE:
626 case GLSL_TYPE_VOID:
627 case GLSL_TYPE_DOUBLE:
628 case GLSL_TYPE_ERROR:
629 case GLSL_TYPE_INTERFACE:
630 unreachable("not reached");
631 }
632
633 return 0;
634 }
635
636 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
637 {
638 init();
639
640 this->file = GRF;
641 this->reg = v->alloc.allocate(v->type_size(type));
642
643 if (type->is_array() || type->is_record()) {
644 this->swizzle = BRW_SWIZZLE_NOOP;
645 } else {
646 this->swizzle = brw_swizzle_for_size(type->vector_elements);
647 }
648
649 this->type = brw_type_for_base_type(type);
650 }
651
652 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
653 {
654 assert(size > 0);
655
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(v->type_size(type) * size);
660
661 this->swizzle = BRW_SWIZZLE_NOOP;
662
663 this->type = brw_type_for_base_type(type);
664 }
665
666 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
667 {
668 init();
669
670 this->file = GRF;
671 this->reg = v->alloc.allocate(v->type_size(type));
672
673 if (type->is_array() || type->is_record()) {
674 this->writemask = WRITEMASK_XYZW;
675 } else {
676 this->writemask = (1 << type->vector_elements) - 1;
677 }
678
679 this->type = brw_type_for_base_type(type);
680 }
681
682 void
683 vec4_visitor::setup_vector_uniform_values(const gl_constant_value *values,
684 unsigned n)
685 {
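/* Each uniform slot holds a vec4's worth of param pointers; components beyond n point at a shared zero so the layout stays fully populated. */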
686 static const gl_constant_value zero = { 0 };
687
688 for (unsigned i = 0; i < n; ++i)
689 stage_prog_data->param[4 * uniforms + i] = &values[i];
690
691 for (unsigned i = n; i < 4; ++i)
692 stage_prog_data->param[4 * uniforms + i] = &zero;
693
694 uniform_vector_size[uniforms++] = n;
695 }
696
697 /* Our support for uniforms is piggy-backed on the struct
698 * gl_fragment_program, because that's where the values actually
699 * get stored, rather than in some global gl_shader_program uniform
700 * store.
701 */
702 void
703 vec4_visitor::setup_uniform_values(ir_variable *ir)
704 {
705 int namelen = strlen(ir->name);
706
707 /* The data for our (non-builtin) uniforms is stored in a series of
708 * gl_uniform_driver_storage structs for each subcomponent that
709 * glGetUniformLocation() could name. We know it's been set up in the same
710 * order we'd walk the type, so walk the list of storage and find anything
711 * with our name, or the prefix of a component that starts with our name.
712 */
713 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
714 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
715
716 if (storage->builtin)
717 continue;
718
719 if (strncmp(ir->name, storage->name, namelen) != 0 ||
720 (storage->name[namelen] != 0 &&
721 storage->name[namelen] != '.' &&
722 storage->name[namelen] != '[')) {
723 continue;
724 }
725
726 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
727 storage->type->matrix_columns);
728 const unsigned vector_size = storage->type->vector_elements;
729
730 for (unsigned s = 0; s < vector_count; s++)
731 setup_vector_uniform_values(&storage->storage[s * vector_size],
732 vector_size);
733 }
734 }
735
736 void
737 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
738 {
739 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
740 assert(this->uniforms < uniform_array_size);
741 this->uniform_vector_size[this->uniforms] = 4;
742 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
743 this->userplane[i].type = BRW_REGISTER_TYPE_F;
744 for (int j = 0; j < 4; ++j) {
745 stage_prog_data->param[this->uniforms * 4 + j] =
746 (gl_constant_value *) &clip_planes[i][j];
747 }
748 ++this->uniforms;
749 }
750 }
751
752 /* Our support for builtin uniforms is even scarier than non-builtin.
753 * It sits on top of the PROG_STATE_VAR parameters that are
754 * automatically updated from GL context state.
755 */
756 void
757 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
758 {
759 const ir_state_slot *const slots = ir->get_state_slots();
760 assert(slots != NULL);
761
762 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
763 /* This state reference has already been setup by ir_to_mesa,
764 * but we'll get the same index back here. We can reference
765 * ParameterValues directly, since unlike brw_fs.cpp, we never
766 * add new state references during compile.
767 */
768 int index = _mesa_add_state_reference(this->prog->Parameters,
769 (gl_state_index *)slots[i].tokens);
770 gl_constant_value *values =
771 &this->prog->Parameters->ParameterValues[index][0];
772
773 assert(this->uniforms < uniform_array_size);
774
775 for (unsigned j = 0; j < 4; j++)
776 stage_prog_data->param[this->uniforms * 4 + j] =
777 &values[GET_SWZ(slots[i].swizzle, j)];
778
779 this->uniform_vector_size[this->uniforms] =
780 (ir->type->is_scalar() || ir->type->is_vector() ||
781 ir->type->is_matrix() ? ir->type->vector_elements : 4);
782
783 this->uniforms++;
784 }
785 }
786
787 dst_reg *
788 vec4_visitor::variable_storage(ir_variable *var)
789 {
790 return (dst_reg *)hash_table_find(this->variable_ht, var);
791 }
792
793 void
794 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
795 enum brw_predicate *predicate)
796 {
797 ir_expression *expr = ir->as_expression();
798
799 *predicate = BRW_PREDICATE_NORMAL;
800
801 if (expr && expr->operation != ir_binop_ubo_load) {
802 src_reg op[3];
803 vec4_instruction *inst;
804
805 assert(expr->get_num_operands() <= 3);
806 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
807 expr->operands[i]->accept(this);
808 op[i] = this->result;
809
810 resolve_ud_negate(&op[i]);
811 }
812
813 switch (expr->operation) {
814 case ir_unop_logic_not:
815 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
816 inst->conditional_mod = BRW_CONDITIONAL_Z;
817 break;
818
819 case ir_binop_logic_xor:
820 if (devinfo->gen <= 5) {
821 src_reg temp = src_reg(this, ir->type);
822 emit(XOR(dst_reg(temp), op[0], op[1]));
823 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
824 } else {
825 inst = emit(XOR(dst_null_d(), op[0], op[1]));
826 }
827 inst->conditional_mod = BRW_CONDITIONAL_NZ;
828 break;
829
830 case ir_binop_logic_or:
831 if (devinfo->gen <= 5) {
832 src_reg temp = src_reg(this, ir->type);
833 emit(OR(dst_reg(temp), op[0], op[1]));
834 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
835 } else {
836 inst = emit(OR(dst_null_d(), op[0], op[1]));
837 }
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 break;
840
841 case ir_binop_logic_and:
842 if (devinfo->gen <= 5) {
843 src_reg temp = src_reg(this, ir->type);
844 emit(AND(dst_reg(temp), op[0], op[1]));
845 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
846 } else {
847 inst = emit(AND(dst_null_d(), op[0], op[1]));
848 }
849 inst->conditional_mod = BRW_CONDITIONAL_NZ;
850 break;
851
852 case ir_unop_f2b:
853 if (devinfo->gen >= 6) {
854 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
855 } else {
856 inst = emit(MOV(dst_null_f(), op[0]));
857 inst->conditional_mod = BRW_CONDITIONAL_NZ;
858 }
859 break;
860
861 case ir_unop_i2b:
862 if (devinfo->gen >= 6) {
863 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
864 } else {
865 inst = emit(MOV(dst_null_d(), op[0]));
866 inst->conditional_mod = BRW_CONDITIONAL_NZ;
867 }
868 break;
869
870 case ir_binop_all_equal:
871 if (devinfo->gen <= 5) {
872 resolve_bool_comparison(expr->operands[0], &op[0]);
873 resolve_bool_comparison(expr->operands[1], &op[1]);
874 }
875 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
876 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
877 break;
878
879 case ir_binop_any_nequal:
880 if (devinfo->gen <= 5) {
881 resolve_bool_comparison(expr->operands[0], &op[0]);
882 resolve_bool_comparison(expr->operands[1], &op[1]);
883 }
884 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
885 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
886 break;
887
888 case ir_unop_any:
889 if (devinfo->gen <= 5) {
890 resolve_bool_comparison(expr->operands[0], &op[0]);
891 }
892 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
893 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
894 break;
895
896 case ir_binop_greater:
897 case ir_binop_gequal:
898 case ir_binop_less:
899 case ir_binop_lequal:
900 case ir_binop_equal:
901 case ir_binop_nequal:
902 if (devinfo->gen <= 5) {
903 resolve_bool_comparison(expr->operands[0], &op[0]);
904 resolve_bool_comparison(expr->operands[1], &op[1]);
905 }
906 emit(CMP(dst_null_d(), op[0], op[1],
907 brw_conditional_for_comparison(expr->operation)));
908 break;
909
910 case ir_triop_csel: {
911 /* Expand the boolean condition into the flag register. */
912 inst = emit(MOV(dst_null_d(), op[0]));
913 inst->conditional_mod = BRW_CONDITIONAL_NZ;
914
915 /* Select which boolean to return. */
916 dst_reg temp(this, expr->operands[1]->type);
917 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
918 inst->predicate = BRW_PREDICATE_NORMAL;
919
920 /* Expand the result to a condition code. */
921 inst = emit(MOV(dst_null_d(), src_reg(temp)));
922 inst->conditional_mod = BRW_CONDITIONAL_NZ;
923 break;
924 }
925
926 default:
927 unreachable("not reached");
928 }
929 return;
930 }
931
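/* No special-cased expression: evaluate the condition normally, then AND it with 1 and set the flag on a non-zero result. */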
932 ir->accept(this);
933
934 resolve_ud_negate(&this->result);
935
936 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
937 inst->conditional_mod = BRW_CONDITIONAL_NZ;
938 }
939
940 /**
941 * Emit a gen6 IF statement with the comparison folded into the IF
942 * instruction.
943 */
944 void
945 vec4_visitor::emit_if_gen6(ir_if *ir)
946 {
947 ir_expression *expr = ir->condition->as_expression();
948
949 if (expr && expr->operation != ir_binop_ubo_load) {
950 src_reg op[3];
951 dst_reg temp;
952
953 assert(expr->get_num_operands() <= 3);
954 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
955 expr->operands[i]->accept(this);
956 op[i] = this->result;
957 }
958
959 switch (expr->operation) {
960 case ir_unop_logic_not:
961 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
962 return;
963
964 case ir_binop_logic_xor:
965 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
966 return;
967
968 case ir_binop_logic_or:
969 temp = dst_reg(this, glsl_type::bool_type);
970 emit(OR(temp, op[0], op[1]));
971 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
972 return;
973
974 case ir_binop_logic_and:
975 temp = dst_reg(this, glsl_type::bool_type);
976 emit(AND(temp, op[0], op[1]));
977 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
978 return;
979
980 case ir_unop_f2b:
981 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
982 return;
983
984 case ir_unop_i2b:
985 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
986 return;
987
988 case ir_binop_greater:
989 case ir_binop_gequal:
990 case ir_binop_less:
991 case ir_binop_lequal:
992 case ir_binop_equal:
993 case ir_binop_nequal:
994 emit(IF(op[0], op[1],
995 brw_conditional_for_comparison(expr->operation)));
996 return;
997
998 case ir_binop_all_equal:
999 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1000 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1001 return;
1002
1003 case ir_binop_any_nequal:
1004 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1005 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1006 return;
1007
1008 case ir_unop_any:
1009 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1010 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1011 return;
1012
1013 case ir_triop_csel: {
1014 /* Expand the boolean condition into the flag register. */
1015 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1016 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1017
1018 /* Select which boolean to return. */
1019 dst_reg temp(this, expr->operands[1]->type);
1020 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1021 inst->predicate = BRW_PREDICATE_NORMAL;
1022
1023 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1024 return;
1025 }
1026
1027 default:
1028 unreachable("not reached");
1029 }
1030 return;
1031 }
1032
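/* No foldable expression: evaluate the condition and emit an IF that tests it against zero. */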
1033 ir->condition->accept(this);
1034
1035 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1036 }
1037
1038 void
1039 vec4_visitor::visit(ir_variable *ir)
1040 {
1041 dst_reg *reg = NULL;
1042
1043 if (variable_storage(ir))
1044 return;
1045
1046 switch (ir->data.mode) {
1047 case ir_var_shader_in:
1048 assert(ir->data.location != -1);
1049 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1050 break;
1051
1052 case ir_var_shader_out:
1053 assert(ir->data.location != -1);
1054 reg = new(mem_ctx) dst_reg(this, ir->type);
1055
1056 for (int i = 0; i < type_size(ir->type); i++) {
1057 output_reg[ir->data.location + i] = *reg;
1058 output_reg[ir->data.location + i].reg_offset = i;
1059 output_reg_annotation[ir->data.location + i] = ir->name;
1060 }
1061 break;
1062
1063 case ir_var_auto:
1064 case ir_var_temporary:
1065 reg = new(mem_ctx) dst_reg(this, ir->type);
1066 break;
1067
1068 case ir_var_uniform:
1069 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1070
1071 /* Thanks to the lower_ubo_reference pass, we will see only
1072 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1073 * variables, so no need for them to be in variable_ht.
1074 *
1075 * Some uniforms, such as samplers and atomic counters, have no actual
1076 * storage, so we should ignore them.
1077 */
1078 if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
1079 return;
1080
1081 /* Track how big the whole uniform variable is, in case we need to put a
1082 * copy of its data into pull constants for array access.
1083 */
1084 assert(this->uniforms < uniform_array_size);
1085 this->uniform_size[this->uniforms] = type_size(ir->type);
1086
1087 if (!strncmp(ir->name, "gl_", 3)) {
1088 setup_builtin_uniform_values(ir);
1089 } else {
1090 setup_uniform_values(ir);
1091 }
1092 break;
1093
1094 case ir_var_system_value:
1095 reg = make_reg_for_system_value(ir->data.location, ir->type);
1096 break;
1097
1098 default:
1099 unreachable("not reached");
1100 }
1101
1102 reg->type = brw_type_for_base_type(ir->type);
1103 hash_table_insert(this->variable_ht, reg, ir);
1104 }
1105
1106 void
1107 vec4_visitor::visit(ir_loop *ir)
1108 {
1109 /* We don't want debugging output to print the whole body of the
1110 * loop as the annotation.
1111 */
1112 this->base_ir = NULL;
1113
1114 emit(BRW_OPCODE_DO);
1115
1116 visit_instructions(&ir->body_instructions);
1117
1118 emit(BRW_OPCODE_WHILE);
1119 }
1120
1121 void
1122 vec4_visitor::visit(ir_loop_jump *ir)
1123 {
1124 switch (ir->mode) {
1125 case ir_loop_jump::jump_break:
1126 emit(BRW_OPCODE_BREAK);
1127 break;
1128 case ir_loop_jump::jump_continue:
1129 emit(BRW_OPCODE_CONTINUE);
1130 break;
1131 }
1132 }
1133
1134
1135 void
1136 vec4_visitor::visit(ir_function_signature *)
1137 {
1138 unreachable("not reached");
1139 }
1140
1141 void
1142 vec4_visitor::visit(ir_function *ir)
1143 {
1144 /* Ignore function bodies other than main() -- we shouldn't see calls to
1145 * them since they should all be inlined.
1146 */
1147 if (strcmp(ir->name, "main") == 0) {
1148 const ir_function_signature *sig;
1149 exec_list empty;
1150
1151 sig = ir->matching_signature(NULL, &empty, false);
1152
1153 assert(sig);
1154
1155 visit_instructions(&sig->body);
1156 }
1157 }
1158
1159 bool
1160 vec4_visitor::try_emit_mad(ir_expression *ir)
1161 {
1162 /* 3-src instructions were introduced in gen6. */
1163 if (devinfo->gen < 6)
1164 return false;
1165
1166 /* MAD can only handle floating-point data. */
1167 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1168 return false;
1169
1170 ir_rvalue *nonmul;
1171 ir_expression *mul;
1172 bool mul_negate, mul_abs;
1173
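/* Check both operands of the add for a multiply, looking through a single negate or abs wrapping it. */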
1174 for (int i = 0; i < 2; i++) {
1175 mul_negate = false;
1176 mul_abs = false;
1177
1178 mul = ir->operands[i]->as_expression();
1179 nonmul = ir->operands[1 - i];
1180
1181 if (mul && mul->operation == ir_unop_abs) {
1182 mul = mul->operands[0]->as_expression();
1183 mul_abs = true;
1184 } else if (mul && mul->operation == ir_unop_neg) {
1185 mul = mul->operands[0]->as_expression();
1186 mul_negate = true;
1187 }
1188
1189 if (mul && mul->operation == ir_binop_mul)
1190 break;
1191 }
1192
1193 if (!mul || mul->operation != ir_binop_mul)
1194 return false;
1195
1196 nonmul->accept(this);
1197 src_reg src0 = fix_3src_operand(this->result);
1198
1199 mul->operands[0]->accept(this);
1200 src_reg src1 = fix_3src_operand(this->result);
1201 src1.negate ^= mul_negate;
1202 src1.abs = mul_abs;
1203 if (mul_abs)
1204 src1.negate = false;
1205
1206 mul->operands[1]->accept(this);
1207 src_reg src2 = fix_3src_operand(this->result);
1208 src2.abs = mul_abs;
1209 if (mul_abs)
1210 src2.negate = false;
1211
1212 this->result = src_reg(this, ir->type);
1213 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1214
1215 return true;
1216 }
1217
1218 bool
1219 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1220 {
1221 /* This optimization relies on CMP setting the destination to 0 when
1222 * false. Early hardware only sets the least significant bit, and
1223 * leaves the other bits undefined. So we can't use it.
1224 */
1225 if (devinfo->gen < 6)
1226 return false;
1227
1228 ir_expression *const cmp = ir->operands[0]->as_expression();
1229
1230 if (cmp == NULL)
1231 return false;
1232
1233 switch (cmp->operation) {
1234 case ir_binop_less:
1235 case ir_binop_greater:
1236 case ir_binop_lequal:
1237 case ir_binop_gequal:
1238 case ir_binop_equal:
1239 case ir_binop_nequal:
1240 break;
1241
1242 default:
1243 return false;
1244 }
1245
1246 cmp->operands[0]->accept(this);
1247 const src_reg cmp_src0 = this->result;
1248
1249 cmp->operands[1]->accept(this);
1250 const src_reg cmp_src1 = this->result;
1251
1252 this->result = src_reg(this, ir->type);
1253
1254 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1255 brw_conditional_for_comparison(cmp->operation)));
1256
1257 /* If the comparison is false, this->result will just happen to be zero.
1258 */
1259 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1260 this->result, src_reg(1.0f));
1261 inst->predicate = BRW_PREDICATE_NORMAL;
1262 inst->predicate_inverse = true;
1263
1264 return true;
1265 }
1266
1267 void
1268 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1269 src_reg src0, src_reg src1)
1270 {
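/* On Gen6+ SEL can apply the conditional mod directly; earlier generations need an explicit CMP followed by a predicated SEL. */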
1271 vec4_instruction *inst;
1272
1273 if (devinfo->gen >= 6) {
1274 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1275 inst->conditional_mod = conditionalmod;
1276 } else {
1277 emit(CMP(dst, src0, src1, conditionalmod));
1278
1279 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1280 inst->predicate = BRW_PREDICATE_NORMAL;
1281 }
1282 }
1283
1284 void
1285 vec4_visitor::emit_lrp(const dst_reg &dst,
1286 const src_reg &x, const src_reg &y, const src_reg &a)
1287 {
1288 if (devinfo->gen >= 6) {
1289 /* Note that the instruction's argument order is reversed from GLSL
1290 * and the IR.
1291 */
1292 emit(LRP(dst,
1293 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1294 } else {
1295 /* Earlier generations don't support three source operations, so we
1296 * need to emit x*(1-a) + y*a.
1297 */
1298 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1299 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1300 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1301 y_times_a.writemask = dst.writemask;
1302 one_minus_a.writemask = dst.writemask;
1303 x_times_one_minus_a.writemask = dst.writemask;
1304
1305 emit(MUL(y_times_a, y, a));
1306 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1307 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1308 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1309 }
1310 }
1311
1312 /**
1313 * Emits the instructions needed to perform a pull constant load. before_block
1314 * and before_inst can be NULL, in which case the instructions will be appended
1315 * to the end of the instruction list.
1316 */
1317 void
1318 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1319 src_reg surf_index,
1320 src_reg offset_reg,
1321 bblock_t *before_block,
1322 vec4_instruction *before_inst)
1323 {
1324 assert((before_inst == NULL && before_block == NULL) ||
1325 (before_inst && before_block));
1326
1327 vec4_instruction *pull;
1328
1329 if (devinfo->gen >= 9) {
1330 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1331 src_reg header(this, glsl_type::uvec4_type, 2);
1332
1333 pull = new(mem_ctx)
1334 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1335 dst_reg(header));
1336
1337 if (before_inst)
1338 emit_before(before_block, before_inst, pull);
1339 else
1340 emit(pull);
1341
1342 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1343 offset_reg.type);
1344 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1345
1346 if (before_inst)
1347 emit_before(before_block, before_inst, pull);
1348 else
1349 emit(pull);
1350
1351 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1352 dst,
1353 surf_index,
1354 header);
1355 pull->mlen = 2;
1356 pull->header_size = 1;
1357 } else if (devinfo->gen >= 7) {
1358 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1359
1360 grf_offset.type = offset_reg.type;
1361
1362 pull = MOV(grf_offset, offset_reg);
1363
1364 if (before_inst)
1365 emit_before(before_block, before_inst, pull);
1366 else
1367 emit(pull);
1368
1369 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1370 dst,
1371 surf_index,
1372 src_reg(grf_offset));
1373 pull->mlen = 1;
1374 } else {
1375 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1376 dst,
1377 surf_index,
1378 offset_reg);
1379 pull->base_mrf = 14;
1380 pull->mlen = 1;
1381 }
1382
1383 if (before_inst)
1384 emit_before(before_block, before_inst, pull);
1385 else
1386 emit(pull);
1387 }
1388
1389 src_reg
1390 vec4_visitor::emit_uniformize(const src_reg &src)
1391 {
1392 const src_reg chan_index(this, glsl_type::uint_type);
1393 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1394 src.type);
1395
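/* FIND_LIVE_CHANNEL writes the index of an enabled channel; BROADCAST then replicates that channel of src across the destination, yielding a value that is uniform across channels. */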
1396 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1397 ->force_writemask_all = true;
1398 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1399 ->force_writemask_all = true;
1400
1401 return src_reg(dst);
1402 }
1403
1404 void
1405 vec4_visitor::visit(ir_expression *ir)
1406 {
1407 unsigned int operand;
1408 src_reg op[ARRAY_SIZE(ir->operands)];
1409 vec4_instruction *inst;
1410
1411 if (ir->operation == ir_binop_add) {
1412 if (try_emit_mad(ir))
1413 return;
1414 }
1415
1416 if (ir->operation == ir_unop_b2f) {
1417 if (try_emit_b2f_of_compare(ir))
1418 return;
1419 }
1420
1421 /* Storage for our result. Ideally for an assignment we'd be using
1422 * the actual storage for the result here, instead.
1423 */
1424 dst_reg result_dst(this, ir->type);
1425 src_reg result_src(result_dst);
1426
1427 if (ir->operation == ir_triop_csel) {
1428 ir->operands[1]->accept(this);
1429 op[1] = this->result;
1430 ir->operands[2]->accept(this);
1431 op[2] = this->result;
1432
1433 enum brw_predicate predicate;
1434 emit_bool_to_cond_code(ir->operands[0], &predicate);
1435 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1436 inst->predicate = predicate;
1437 this->result = result_src;
1438 return;
1439 }
1440
1441 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1442 this->result.file = BAD_FILE;
1443 ir->operands[operand]->accept(this);
1444 if (this->result.file == BAD_FILE) {
1445 fprintf(stderr, "Failed to get tree for expression operand:\n");
1446 ir->operands[operand]->fprint(stderr);
1447 exit(1);
1448 }
1449 op[operand] = this->result;
1450
1451 /* Matrix expression operands should have been broken down to vector
1452 * operations already.
1453 */
1454 assert(!ir->operands[operand]->type->is_matrix());
1455 }
1456
1457 /* If nothing special happens, this is the result. */
1458 this->result = result_src;
1459
1460 switch (ir->operation) {
1461 case ir_unop_logic_not:
1462 emit(NOT(result_dst, op[0]));
1463 break;
1464 case ir_unop_neg:
1465 op[0].negate = !op[0].negate;
1466 emit(MOV(result_dst, op[0]));
1467 break;
1468 case ir_unop_abs:
1469 op[0].abs = true;
1470 op[0].negate = false;
1471 emit(MOV(result_dst, op[0]));
1472 break;
1473
1474 case ir_unop_sign:
1475 if (ir->type->is_float()) {
1476 /* AND(val, 0x80000000) gives the sign bit.
1477 *
1478 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1479 * zero.
1480 */
1481 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1482
1483 op[0].type = BRW_REGISTER_TYPE_UD;
1484 result_dst.type = BRW_REGISTER_TYPE_UD;
1485 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1486
1487 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1488 inst->predicate = BRW_PREDICATE_NORMAL;
1489
1490 this->result.type = BRW_REGISTER_TYPE_F;
1491 } else {
1492 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1493 * -> non-negative val generates 0x00000000.
1494 * Predicated OR sets 1 if val is positive.
1495 */
1496 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1497
1498 emit(ASR(result_dst, op[0], src_reg(31)));
1499
1500 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1501 inst->predicate = BRW_PREDICATE_NORMAL;
1502 }
1503 break;
1504
1505 case ir_unop_rcp:
1506 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1507 break;
1508
1509 case ir_unop_exp2:
1510 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1511 break;
1512 case ir_unop_log2:
1513 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1514 break;
1515 case ir_unop_exp:
1516 case ir_unop_log:
1517 unreachable("not reached: should be handled by ir_explog_to_explog2");
1518 case ir_unop_sin:
1519 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1520 break;
1521 case ir_unop_cos:
1522 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1523 break;
1524
1525 case ir_unop_dFdx:
1526 case ir_unop_dFdx_coarse:
1527 case ir_unop_dFdx_fine:
1528 case ir_unop_dFdy:
1529 case ir_unop_dFdy_coarse:
1530 case ir_unop_dFdy_fine:
1531 unreachable("derivatives not valid in vertex shader");
1532
1533 case ir_unop_bitfield_reverse:
1534 emit(BFREV(result_dst, op[0]));
1535 break;
1536 case ir_unop_bit_count:
1537 emit(CBIT(result_dst, op[0]));
1538 break;
1539 case ir_unop_find_msb: {
1540 src_reg temp = src_reg(this, glsl_type::uint_type);
1541
1542 inst = emit(FBH(dst_reg(temp), op[0]));
1543 inst->dst.writemask = WRITEMASK_XYZW;
1544
1545 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1546 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1547 * subtract the result from 31 to convert the MSB count into an LSB count.
1548 */
1549
1550 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1551 temp.swizzle = BRW_SWIZZLE_NOOP;
1552 emit(MOV(result_dst, temp));
1553
1554 src_reg src_tmp = src_reg(result_dst);
1555 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1556
1557 src_tmp.negate = true;
1558 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1559 inst->predicate = BRW_PREDICATE_NORMAL;
1560 break;
1561 }
1562 case ir_unop_find_lsb:
1563 emit(FBL(result_dst, op[0]));
1564 break;
1565 case ir_unop_saturate:
1566 inst = emit(MOV(result_dst, op[0]));
1567 inst->saturate = true;
1568 break;
1569
1570 case ir_unop_noise:
1571 unreachable("not reached: should be handled by lower_noise");
1572
1573 case ir_unop_subroutine_to_int:
1574 emit(MOV(result_dst, op[0]));
1575 break;
1576
1577 case ir_binop_add:
1578 emit(ADD(result_dst, op[0], op[1]));
1579 break;
1580 case ir_binop_sub:
1581 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1582
1583 case ir_binop_mul:
1584 if (devinfo->gen < 8 && ir->type->is_integer()) {
1585 /* For integer multiplication, the MUL uses the low 16 bits of one of
1586 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1587 * accumulates the contribution of the upper 16 bits of that
1588 * operand. If we can determine that one of the args is in the low
1589 * 16 bits, though, we can just emit a single MUL.
1590 */
1591 if (ir->operands[0]->is_uint16_constant()) {
1592 if (devinfo->gen < 7)
1593 emit(MUL(result_dst, op[0], op[1]));
1594 else
1595 emit(MUL(result_dst, op[1], op[0]));
1596 } else if (ir->operands[1]->is_uint16_constant()) {
1597 if (devinfo->gen < 7)
1598 emit(MUL(result_dst, op[1], op[0]));
1599 else
1600 emit(MUL(result_dst, op[0], op[1]));
1601 } else {
1602 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1603
1604 emit(MUL(acc, op[0], op[1]));
1605 emit(MACH(dst_null_d(), op[0], op[1]));
1606 emit(MOV(result_dst, src_reg(acc)));
1607 }
1608 } else {
1609 emit(MUL(result_dst, op[0], op[1]));
1610 }
1611 break;
1612 case ir_binop_imul_high: {
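/* MUL leaves the low 32 bits of the product in the accumulator; MACH then produces the high 32 bits. */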
1613 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1614
1615 emit(MUL(acc, op[0], op[1]));
1616 emit(MACH(result_dst, op[0], op[1]));
1617 break;
1618 }
1619 case ir_binop_div:
1620 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1621 assert(ir->type->is_integer());
1622 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1623 break;
1624
1625 case ir_binop_carry:
1626 unreachable("Should have been lowered by carry_to_arith().");
1627
1628 case ir_binop_borrow:
1629 unreachable("Should have been lowered by borrow_to_arith().");
1630
1631 case ir_binop_mod:
1632 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1633 assert(ir->type->is_integer());
1634 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1635 break;
1636
1637 case ir_binop_less:
1638 case ir_binop_greater:
1639 case ir_binop_lequal:
1640 case ir_binop_gequal:
1641 case ir_binop_equal:
1642 case ir_binop_nequal: {
1643 if (devinfo->gen <= 5) {
1644 resolve_bool_comparison(ir->operands[0], &op[0]);
1645 resolve_bool_comparison(ir->operands[1], &op[1]);
1646 }
1647 emit(CMP(result_dst, op[0], op[1],
1648 brw_conditional_for_comparison(ir->operation)));
1649 break;
1650 }
1651
1652 case ir_binop_all_equal:
1653 if (devinfo->gen <= 5) {
1654 resolve_bool_comparison(ir->operands[0], &op[0]);
1655 resolve_bool_comparison(ir->operands[1], &op[1]);
1656 }
1657
1658 /* "==" operator producing a scalar boolean. */
1659 if (ir->operands[0]->type->is_vector() ||
1660 ir->operands[1]->type->is_vector()) {
1661 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1662 emit(MOV(result_dst, src_reg(0)));
1663 inst = emit(MOV(result_dst, src_reg(~0)));
1664 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1665 } else {
1666 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1667 }
1668 break;
1669 case ir_binop_any_nequal:
1670 if (devinfo->gen <= 5) {
1671 resolve_bool_comparison(ir->operands[0], &op[0]);
1672 resolve_bool_comparison(ir->operands[1], &op[1]);
1673 }
1674
1675 /* "!=" operator producing a scalar boolean. */
1676 if (ir->operands[0]->type->is_vector() ||
1677 ir->operands[1]->type->is_vector()) {
1678 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1679
1680 emit(MOV(result_dst, src_reg(0)));
1681 inst = emit(MOV(result_dst, src_reg(~0)));
1682 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1683 } else {
1684 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1685 }
1686 break;
1687
1688 case ir_unop_any:
1689 if (devinfo->gen <= 5) {
1690 resolve_bool_comparison(ir->operands[0], &op[0]);
1691 }
1692 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1693 emit(MOV(result_dst, src_reg(0)));
1694
1695 inst = emit(MOV(result_dst, src_reg(~0)));
1696 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1697 break;
1698
1699 case ir_binop_logic_xor:
1700 emit(XOR(result_dst, op[0], op[1]));
1701 break;
1702
1703 case ir_binop_logic_or:
1704 emit(OR(result_dst, op[0], op[1]));
1705 break;
1706
1707 case ir_binop_logic_and:
1708 emit(AND(result_dst, op[0], op[1]));
1709 break;
1710
1711 case ir_binop_dot:
1712 assert(ir->operands[0]->type->is_vector());
1713 assert(ir->operands[0]->type == ir->operands[1]->type);
1714 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1715 break;
1716
1717 case ir_unop_sqrt:
1718 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1719 break;
1720 case ir_unop_rsq:
1721 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1722 break;
1723
1724 case ir_unop_bitcast_i2f:
1725 case ir_unop_bitcast_u2f:
1726 this->result = op[0];
1727 this->result.type = BRW_REGISTER_TYPE_F;
1728 break;
1729
1730 case ir_unop_bitcast_f2i:
1731 this->result = op[0];
1732 this->result.type = BRW_REGISTER_TYPE_D;
1733 break;
1734
1735 case ir_unop_bitcast_f2u:
1736 this->result = op[0];
1737 this->result.type = BRW_REGISTER_TYPE_UD;
1738 break;
1739
1740 case ir_unop_i2f:
1741 case ir_unop_i2u:
1742 case ir_unop_u2i:
1743 case ir_unop_u2f:
1744 case ir_unop_f2i:
1745 case ir_unop_f2u:
1746 emit(MOV(result_dst, op[0]));
1747 break;
1748 case ir_unop_b2i:
1749 case ir_unop_b2f:
1750 if (devinfo->gen <= 5) {
1751 resolve_bool_comparison(ir->operands[0], &op[0]);
1752 }
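/* Booleans are stored as 0 or ~0 (0 or -1), so negating gives 0 or 1, which the MOV then converts to the destination type. */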
1753 emit(MOV(result_dst, negate(op[0])));
1754 break;
1755 case ir_unop_f2b:
1756 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1757 break;
1758 case ir_unop_i2b:
1759 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1760 break;
1761
1762 case ir_unop_trunc:
1763 emit(RNDZ(result_dst, op[0]));
1764 break;
1765 case ir_unop_ceil: {
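/* ceil(x) = -floor(-x): negate, round toward negative infinity, negate again. */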
1766 src_reg tmp = src_reg(this, ir->type);
1767 op[0].negate = !op[0].negate;
1768 emit(RNDD(dst_reg(tmp), op[0]));
1769 tmp.negate = true;
1770 emit(MOV(result_dst, tmp));
1771 }
1772 break;
1773 case ir_unop_floor:
1774 inst = emit(RNDD(result_dst, op[0]));
1775 break;
1776 case ir_unop_fract:
1777 inst = emit(FRC(result_dst, op[0]));
1778 break;
1779 case ir_unop_round_even:
1780 emit(RNDE(result_dst, op[0]));
1781 break;
1782
1783 case ir_binop_min:
1784 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1785 break;
1786 case ir_binop_max:
1787 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1788 break;
1789
1790 case ir_binop_pow:
1791 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1792 break;
1793
1794 case ir_unop_bit_not:
1795 inst = emit(NOT(result_dst, op[0]));
1796 break;
1797 case ir_binop_bit_and:
1798 inst = emit(AND(result_dst, op[0], op[1]));
1799 break;
1800 case ir_binop_bit_xor:
1801 inst = emit(XOR(result_dst, op[0], op[1]));
1802 break;
1803 case ir_binop_bit_or:
1804 inst = emit(OR(result_dst, op[0], op[1]));
1805 break;
1806
1807 case ir_binop_lshift:
1808 inst = emit(SHL(result_dst, op[0], op[1]));
1809 break;
1810
1811 case ir_binop_rshift:
1812 if (ir->type->base_type == GLSL_TYPE_INT)
1813 inst = emit(ASR(result_dst, op[0], op[1]));
1814 else
1815 inst = emit(SHR(result_dst, op[0], op[1]));
1816 break;
1817
1818 case ir_binop_bfm:
1819 emit(BFI1(result_dst, op[0], op[1]));
1820 break;
1821
1822 case ir_binop_ubo_load: {
1823 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1824 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1825 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1826 src_reg offset;
1827
1828 /* Now, load the vector from that offset. */
1829 assert(ir->type->is_vector() || ir->type->is_scalar());
1830
1831 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1832 packed_consts.type = result.type;
1833 src_reg surf_index;
1834
1835 if (const_uniform_block) {
1836 /* The block index is a constant, so just emit the binding table entry
1837 * as an immediate.
1838 */
1839 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1840 const_uniform_block->value.u[0]);
1841 } else {
1842 /* The block index is not a constant. Evaluate the index expression
1843 * per-channel and add the base UBO index; we have to select a value
1844 * from any live channel.
1845 */
1846 surf_index = src_reg(this, glsl_type::uint_type);
1847 emit(ADD(dst_reg(surf_index), op[0],
1848 src_reg(prog_data->base.binding_table.ubo_start)));
1849 surf_index = emit_uniformize(surf_index);
1850
1851 /* Assume this may touch any UBO. It would be nice to provide
1852 * a tighter bound, but the array information is already lowered away.
1853 */
1854 brw_mark_surface_used(&prog_data->base,
1855 prog_data->base.binding_table.ubo_start +
1856 shader_prog->NumUniformBlocks - 1);
1857 }
1858
1859 if (const_offset_ir) {
1860 if (devinfo->gen >= 8) {
1861 /* Store the offset in a GRF so we can send-from-GRF. */
1862 offset = src_reg(this, glsl_type::int_type);
1863 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1864 } else {
1865 /* Immediates are fine on older generations since they'll be moved
1866 * to a (potentially fake) MRF at the generator level.
1867 */
1868 offset = src_reg(const_offset / 16);
1869 }
1870 } else {
1871 offset = src_reg(this, glsl_type::uint_type);
1872 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1873 }
1874
1875 emit_pull_constant_load_reg(dst_reg(packed_consts),
1876 surf_index,
1877 offset,
1878 NULL, NULL /* before_block/inst */);
1879
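/* The pull load fetches a 16-byte-aligned vec4; bias every swizzle channel by (const_offset % 16) / 4 so the read starts at the requested dword within it. */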
1880 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1881 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1882 const_offset % 16 / 4,
1883 const_offset % 16 / 4,
1884 const_offset % 16 / 4);
1885
1886 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1887 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1888 emit(CMP(result_dst, packed_consts, src_reg(0u),
1889 BRW_CONDITIONAL_NZ));
1890 } else {
1891 emit(MOV(result_dst, packed_consts));
1892 }
1893 break;
1894 }
1895
1896 case ir_binop_vector_extract:
1897 unreachable("should have been lowered by vec_index_to_cond_assign");
1898
1899 case ir_triop_fma:
1900 op[0] = fix_3src_operand(op[0]);
1901 op[1] = fix_3src_operand(op[1]);
1902 op[2] = fix_3src_operand(op[2]);
1903 /* Note that the instruction's argument order is reversed from GLSL
1904 * and the IR.
1905 */
1906 emit(MAD(result_dst, op[2], op[1], op[0]));
1907 break;
1908
1909 case ir_triop_lrp:
1910 emit_lrp(result_dst, op[0], op[1], op[2]);
1911 break;
1912
1913 case ir_triop_csel:
1914 unreachable("already handled above");
1915 break;
1916
1917 case ir_triop_bfi:
1918 op[0] = fix_3src_operand(op[0]);
1919 op[1] = fix_3src_operand(op[1]);
1920 op[2] = fix_3src_operand(op[2]);
1921 emit(BFI2(result_dst, op[0], op[1], op[2]));
1922 break;
1923
1924 case ir_triop_bitfield_extract:
1925 op[0] = fix_3src_operand(op[0]);
1926 op[1] = fix_3src_operand(op[1]);
1927 op[2] = fix_3src_operand(op[2]);
1928 /* Note that the instruction's argument order is reversed from GLSL
1929 * and the IR.
1930 */
1931 emit(BFE(result_dst, op[2], op[1], op[0]));
1932 break;
1933
1934 case ir_triop_vector_insert:
1935 unreachable("should have been lowered by lower_vector_insert");
1936
1937 case ir_quadop_bitfield_insert:
1938 unreachable("not reached: should be handled by "
1939 "bitfield_insert_to_bfm_bfi\n");
1940
1941 case ir_quadop_vector:
1942 unreachable("not reached: should be handled by lower_quadop_vector");
1943
1944 case ir_unop_pack_half_2x16:
1945 emit_pack_half_2x16(result_dst, op[0]);
1946 break;
1947 case ir_unop_unpack_half_2x16:
1948 emit_unpack_half_2x16(result_dst, op[0]);
1949 break;
1950 case ir_unop_unpack_unorm_4x8:
1951 emit_unpack_unorm_4x8(result_dst, op[0]);
1952 break;
1953 case ir_unop_unpack_snorm_4x8:
1954 emit_unpack_snorm_4x8(result_dst, op[0]);
1955 break;
1956 case ir_unop_pack_unorm_4x8:
1957 emit_pack_unorm_4x8(result_dst, op[0]);
1958 break;
1959 case ir_unop_pack_snorm_4x8:
1960 emit_pack_snorm_4x8(result_dst, op[0]);
1961 break;
1962 case ir_unop_pack_snorm_2x16:
1963 case ir_unop_pack_unorm_2x16:
1964 case ir_unop_unpack_snorm_2x16:
1965 case ir_unop_unpack_unorm_2x16:
1966 unreachable("not reached: should be handled by lower_packing_builtins");
1967 case ir_unop_unpack_half_2x16_split_x:
1968 case ir_unop_unpack_half_2x16_split_y:
1969 case ir_binop_pack_half_2x16_split:
1970 case ir_unop_interpolate_at_centroid:
1971 case ir_binop_interpolate_at_sample:
1972 case ir_binop_interpolate_at_offset:
1973 unreachable("not reached: should not occur in vertex shader");
1974 case ir_binop_ldexp:
1975 unreachable("not reached: should be handled by ldexp_to_arith()");
1976 case ir_unop_d2f:
1977 case ir_unop_f2d:
1978 case ir_unop_d2i:
1979 case ir_unop_i2d:
1980 case ir_unop_d2u:
1981 case ir_unop_u2d:
1982 case ir_unop_d2b:
1983 case ir_unop_pack_double_2x32:
1984 case ir_unop_unpack_double_2x32:
1985 case ir_unop_frexp_sig:
1986 case ir_unop_frexp_exp:
1987 unreachable("fp64 todo");
1988 }
1989 }
1990
1991
1992 void
1993 vec4_visitor::visit(ir_swizzle *ir)
1994 {
1995    /* Note that this handles only swizzles in expressions, not those on the
1996     * left-hand side of an assignment, which use write masking.  See
1997     * ir_assignment for that.
1998 */
1999 const unsigned swz = brw_compose_swizzle(
2000 brw_swizzle_for_size(ir->type->vector_elements),
2001 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2002
2003 ir->val->accept(this);
2004 this->result = swizzle(this->result, swz);
2005 }
2006
2007 void
2008 vec4_visitor::visit(ir_dereference_variable *ir)
2009 {
2010 const struct glsl_type *type = ir->type;
2011 dst_reg *reg = variable_storage(ir->var);
2012
2013 if (!reg) {
2014 fail("Failed to find variable storage for %s\n", ir->var->name);
2015 this->result = src_reg(brw_null_reg());
2016 return;
2017 }
2018
2019 this->result = src_reg(*reg);
2020
2021 /* System values get their swizzle from the dst_reg writemask */
2022 if (ir->var->data.mode == ir_var_system_value)
2023 return;
2024
2025 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2026 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2027 }
2028
2029
2030 int
2031 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2032 {
2033 /* Under normal circumstances array elements are stored consecutively, so
2034 * the stride is equal to the size of the array element.
2035 */
2036 return type_size(ir->type);
2037 }
2038
2039
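/**
 * Handle array dereferences.  A constant index is folded directly into the
 * register offset; a variable index is multiplied by the array stride and
 * chained onto the register's reladdr, to be resolved later by the
 * scratch/pull-constant lowering passes below.
 */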
2040 void
2041 vec4_visitor::visit(ir_dereference_array *ir)
2042 {
2043 ir_constant *constant_index;
2044 src_reg src;
2045 int array_stride = compute_array_stride(ir);
2046
2047 constant_index = ir->array_index->constant_expression_value();
2048
2049 ir->array->accept(this);
2050 src = this->result;
2051
2052 if (constant_index) {
2053 src.reg_offset += constant_index->value.i[0] * array_stride;
2054 } else {
2055 /* Variable index array dereference. It eats the "vec4" of the
2056 * base of the array and an index that offsets the Mesa register
2057 * index.
2058 */
2059 ir->array_index->accept(this);
2060
2061 src_reg index_reg;
2062
2063 if (array_stride == 1) {
2064 index_reg = this->result;
2065 } else {
2066 index_reg = src_reg(this, glsl_type::int_type);
2067
2068 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2069 }
2070
2071 if (src.reladdr) {
2072 src_reg temp = src_reg(this, glsl_type::int_type);
2073
2074 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2075
2076 index_reg = temp;
2077 }
2078
2079 src.reladdr = ralloc(mem_ctx, src_reg);
2080 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2081 }
2082
2083 /* If the type is smaller than a vec4, replicate the last channel out. */
2084 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2085 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2086 else
2087 src.swizzle = BRW_SWIZZLE_NOOP;
2088 src.type = brw_type_for_base_type(ir->type);
2089
2090 this->result = src;
2091 }
2092
2093 void
2094 vec4_visitor::visit(ir_dereference_record *ir)
2095 {
2096 unsigned int i;
2097 const glsl_type *struct_type = ir->record->type;
2098 int offset = 0;
2099
2100 ir->record->accept(this);
2101
2102 for (i = 0; i < struct_type->length; i++) {
2103 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2104 break;
2105 offset += type_size(struct_type->fields.structure[i].type);
2106 }
2107
2108 /* If the type is smaller than a vec4, replicate the last channel out. */
2109 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2110 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2111 else
2112 this->result.swizzle = BRW_SWIZZLE_NOOP;
2113 this->result.type = brw_type_for_base_type(ir->type);
2114
2115 this->result.reg_offset += offset;
2116 }
2117
2118 /**
2119 * We want to be careful in assignment setup to hit the actual storage
2120 * instead of potentially using a temporary like we might with the
2121 * ir_dereference handler.
2122 */
2123 static dst_reg
2124 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2125 {
2126    /* The LHS must be a dereference.  If the LHS is a variable-indexed array
2127     * access of a vector, it must have been separated into a series of
2128     * conditional moves before reaching this point (see ir_vec_index_to_cond_assign).
2129 */
2130 assert(ir->as_dereference());
2131 ir_dereference_array *deref_array = ir->as_dereference_array();
2132 if (deref_array) {
2133 assert(!deref_array->array->type->is_vector());
2134 }
2135
2136 /* Use the rvalue deref handler for the most part. We'll ignore
2137 * swizzles in it and write swizzles using writemask, though.
2138 */
2139 ir->accept(v);
2140 return dst_reg(v->result);
2141 }
2142
2143 void
2144 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2145 const struct glsl_type *type,
2146 enum brw_predicate predicate)
2147 {
2148 if (type->base_type == GLSL_TYPE_STRUCT) {
2149 for (unsigned int i = 0; i < type->length; i++) {
2150 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2151 }
2152 return;
2153 }
2154
2155 if (type->is_array()) {
2156 for (unsigned int i = 0; i < type->length; i++) {
2157 emit_block_move(dst, src, type->fields.array, predicate);
2158 }
2159 return;
2160 }
2161
2162 if (type->is_matrix()) {
2163 const struct glsl_type *vec_type;
2164
2165 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2166 type->vector_elements, 1);
2167
2168 for (int i = 0; i < type->matrix_columns; i++) {
2169 emit_block_move(dst, src, vec_type, predicate);
2170 }
2171 return;
2172 }
2173
2174 assert(type->is_scalar() || type->is_vector());
2175
2176 dst->type = brw_type_for_base_type(type);
2177 src->type = dst->type;
2178
2179 dst->writemask = (1 << type->vector_elements) - 1;
2180
2181 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2182
2183 vec4_instruction *inst = emit(MOV(*dst, *src));
2184 inst->predicate = predicate;
2185
2186 dst->reg_offset++;
2187 src->reg_offset++;
2188 }
2189
2190
2191 /* If the RHS processing resulted in an instruction generating a
2192 * temporary value, and it would be easy to rewrite the instruction to
2193 * generate its result right into the LHS instead, do so. This ends
2194 * up reliably removing instructions where it can be tricky to do so
2195 * later without real UD chain information.
2196 */
2197 bool
2198 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2199 dst_reg dst,
2200 src_reg src,
2201 vec4_instruction *pre_rhs_inst,
2202 vec4_instruction *last_rhs_inst)
2203 {
2204 /* This could be supported, but it would take more smarts. */
2205 if (ir->condition)
2206 return false;
2207
2208 if (pre_rhs_inst == last_rhs_inst)
2209 return false; /* No instructions generated to work with. */
2210
2211 /* Make sure the last instruction generated our source reg. */
2212 if (src.file != GRF ||
2213 src.file != last_rhs_inst->dst.file ||
2214 src.reg != last_rhs_inst->dst.reg ||
2215 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2216 src.reladdr ||
2217 src.abs ||
2218 src.negate ||
2219 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2220 return false;
2221
2222    /* Check that the last instruction fully initialized the channels
2223     * we want to use, in the order we want to use them.  We could
2224     * potentially reswizzle the operands of many instructions so that
2225     * we could handle out-of-order channels, but we don't do that yet.
2226 */
2227
2228 for (unsigned i = 0; i < 4; i++) {
2229 if (dst.writemask & (1 << i)) {
2230 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2231 return false;
2232
2233 if (BRW_GET_SWZ(src.swizzle, i) != i)
2234 return false;
2235 }
2236 }
2237
2238 /* Success! Rewrite the instruction. */
2239 last_rhs_inst->dst.file = dst.file;
2240 last_rhs_inst->dst.reg = dst.reg;
2241 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2242 last_rhs_inst->dst.reladdr = dst.reladdr;
2243 last_rhs_inst->dst.writemask &= dst.writemask;
2244
2245 return true;
2246 }
2247
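/**
 * Handle assignments.  Struct, array and matrix LHS types are copied with
 * emit_block_move(); scalar/vector assignments swizzle the RHS into the
 * written channels and, when possible, rewrite the instruction that
 * produced the RHS to write straight into the LHS (try_rewrite_rhs_to_dst).
 */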
2248 void
2249 vec4_visitor::visit(ir_assignment *ir)
2250 {
2251 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2252 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2253
2254 if (!ir->lhs->type->is_scalar() &&
2255 !ir->lhs->type->is_vector()) {
2256 ir->rhs->accept(this);
2257 src_reg src = this->result;
2258
2259 if (ir->condition) {
2260 emit_bool_to_cond_code(ir->condition, &predicate);
2261 }
2262
2263 /* emit_block_move doesn't account for swizzles in the source register.
2264 * This should be ok, since the source register is a structure or an
2265 * array, and those can't be swizzled. But double-check to be sure.
2266 */
2267 assert(src.swizzle ==
2268 (ir->rhs->type->is_matrix()
2269 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2270 : BRW_SWIZZLE_NOOP));
2271
2272 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2273 return;
2274 }
2275
2276 /* Now we're down to just a scalar/vector with writemasks. */
2277 int i;
2278
2279 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2280 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2281
2282 ir->rhs->accept(this);
2283
2284 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2285
2286 int swizzles[4];
2287 int src_chan = 0;
2288
2289 assert(ir->lhs->type->is_vector() ||
2290 ir->lhs->type->is_scalar());
2291 dst.writemask = ir->write_mask;
2292
2293 /* Swizzle a small RHS vector into the channels being written.
2294 *
2295     * GLSL IR treats write_mask as dictating how many channels are
2296     * present on the RHS, while in our instructions we need to make
2297     * those channels appear in the slots of the vec4 they're written to.
2298 */
2299 for (int i = 0; i < 4; i++)
2300 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2301
2302 src_reg src = swizzle(this->result,
2303 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2304 swizzles[2], swizzles[3]));
2305
2306 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2307 return;
2308 }
2309
2310 if (ir->condition) {
2311 emit_bool_to_cond_code(ir->condition, &predicate);
2312 }
2313
2314 for (i = 0; i < type_size(ir->lhs->type); i++) {
2315 vec4_instruction *inst = emit(MOV(dst, src));
2316 inst->predicate = predicate;
2317
2318 dst.reg_offset++;
2319 src.reg_offset++;
2320 }
2321 }
2322
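/**
 * Emit MOVs of immediate values for an ir_constant, recursing through
 * structs, arrays and matrix columns.  For vectors, channels that share the
 * same value are grouped into a single MOV via the writemask.
 */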
2323 void
2324 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2325 {
2326 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2327 foreach_in_list(ir_constant, field_value, &ir->components) {
2328 emit_constant_values(dst, field_value);
2329 }
2330 return;
2331 }
2332
2333 if (ir->type->is_array()) {
2334 for (unsigned int i = 0; i < ir->type->length; i++) {
2335 emit_constant_values(dst, ir->array_elements[i]);
2336 }
2337 return;
2338 }
2339
2340 if (ir->type->is_matrix()) {
2341 for (int i = 0; i < ir->type->matrix_columns; i++) {
2342 float *vec = &ir->value.f[i * ir->type->vector_elements];
2343
2344 for (int j = 0; j < ir->type->vector_elements; j++) {
2345 dst->writemask = 1 << j;
2346 dst->type = BRW_REGISTER_TYPE_F;
2347
2348 emit(MOV(*dst, src_reg(vec[j])));
2349 }
2350 dst->reg_offset++;
2351 }
2352 return;
2353 }
2354
2355 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2356
2357 for (int i = 0; i < ir->type->vector_elements; i++) {
2358 if (!(remaining_writemask & (1 << i)))
2359 continue;
2360
2361 dst->writemask = 1 << i;
2362 dst->type = brw_type_for_base_type(ir->type);
2363
2364 /* Find other components that match the one we're about to
2365 * write. Emits fewer instructions for things like vec4(0.5,
2366 * 1.5, 1.5, 1.5).
2367 */
2368 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2369 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2370 if (ir->value.b[i] == ir->value.b[j])
2371 dst->writemask |= (1 << j);
2372 } else {
2373 /* u, i, and f storage all line up, so no need for a
2374 * switch case for comparing each type.
2375 */
2376 if (ir->value.u[i] == ir->value.u[j])
2377 dst->writemask |= (1 << j);
2378 }
2379 }
2380
2381 switch (ir->type->base_type) {
2382 case GLSL_TYPE_FLOAT:
2383 emit(MOV(*dst, src_reg(ir->value.f[i])));
2384 break;
2385 case GLSL_TYPE_INT:
2386 emit(MOV(*dst, src_reg(ir->value.i[i])));
2387 break;
2388 case GLSL_TYPE_UINT:
2389 emit(MOV(*dst, src_reg(ir->value.u[i])));
2390 break;
2391 case GLSL_TYPE_BOOL:
2392 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2393 break;
2394 default:
2395 unreachable("Non-float/uint/int/bool constant");
2396 }
2397
2398 remaining_writemask &= ~dst->writemask;
2399 }
2400 dst->reg_offset++;
2401 }
2402
2403 void
2404 vec4_visitor::visit(ir_constant *ir)
2405 {
2406 dst_reg dst = dst_reg(this, ir->type);
2407 this->result = src_reg(dst);
2408
2409 emit_constant_values(&dst, ir);
2410 }
2411
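/**
 * Lower the atomic counter intrinsics (read, increment, predecrement) to
 * untyped surface read/atomic messages on the counter's ABO surface,
 * computing the surface offset from the counter's offset plus any dynamic
 * array index.
 */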
2412 void
2413 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2414 {
2415 ir_dereference *deref = static_cast<ir_dereference *>(
2416 ir->actual_parameters.get_head());
2417 ir_variable *location = deref->variable_referenced();
2418 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2419 location->data.binding);
2420
2421 /* Calculate the surface offset */
2422 src_reg offset(this, glsl_type::uint_type);
2423 ir_dereference_array *deref_array = deref->as_dereference_array();
2424 if (deref_array) {
2425 deref_array->array_index->accept(this);
2426
2427 src_reg tmp(this, glsl_type::uint_type);
2428 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2429 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2430 } else {
2431 offset = location->data.atomic.offset;
2432 }
2433
2434 /* Emit the appropriate machine instruction */
2435 const char *callee = ir->callee->function_name();
2436 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2437
2438 if (!strcmp("__intrinsic_atomic_read", callee)) {
2439 emit_untyped_surface_read(surf_index, dst, offset);
2440
2441 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2442 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2443 src_reg(), src_reg());
2444
2445 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2446 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2447 src_reg(), src_reg());
2448 }
2449
2450 brw_mark_surface_used(stage_prog_data, surf_index);
2451 }
2452
2453 void
2454 vec4_visitor::visit(ir_call *ir)
2455 {
2456 const char *callee = ir->callee->function_name();
2457
2458 if (!strcmp("__intrinsic_atomic_read", callee) ||
2459 !strcmp("__intrinsic_atomic_increment", callee) ||
2460 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2461 visit_atomic_counter_intrinsic(ir);
2462 } else {
2463 unreachable("Unsupported intrinsic.");
2464 }
2465 }
2466
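/**
 * Fetch the MCS (multisample control surface) value for a texel so it can
 * be passed along with a compressed multisample texel fetch (ir_txf_ms).
 */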
2467 src_reg
2468 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2469 {
2470 vec4_instruction *inst =
2471 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2472 dst_reg(this, glsl_type::uvec4_type));
2473 inst->base_mrf = 2;
2474 inst->src[1] = sampler;
2475
2476 int param_base;
2477
2478 if (devinfo->gen >= 9) {
2479 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2480 vec4_instruction *header_inst = new(mem_ctx)
2481 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2482 dst_reg(MRF, inst->base_mrf));
2483
2484 emit(header_inst);
2485
2486 inst->mlen = 2;
2487 inst->header_size = 1;
2488 param_base = inst->base_mrf + 1;
2489 } else {
2490 inst->mlen = 1;
2491 param_base = inst->base_mrf;
2492 }
2493
2494    /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2495 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2496 int zero_mask = 0xf & ~coord_mask;
2497
2498 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2499 coordinate));
2500
2501 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2502 src_reg(0)));
2503
2504 emit(inst);
2505 return src_reg(inst->dst);
2506 }
2507
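/* Returns true when the sampler index may be 16 or higher, in which case it
 * has to be routed through the message header; only Haswell and Gen8+ take
 * this path.
 */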
2508 static bool
2509 is_high_sampler(const struct brw_device_info *devinfo, src_reg sampler)
2510 {
2511 if (devinfo->gen < 8 && !devinfo->is_haswell)
2512 return false;
2513
2514 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2515 }
2516
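/**
 * Handle texturing operations: resolve the sampler index (including
 * non-constant sampler array indexing), evaluate the operands, load them
 * into MRFs in the layout the sampler message expects, and emit the sampler
 * instruction.  The raw result is then swizzled per the texture swizzle key.
 */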
2517 void
2518 vec4_visitor::visit(ir_texture *ir)
2519 {
2520 uint32_t sampler =
2521 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2522
2523 ir_rvalue *nonconst_sampler_index =
2524 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2525
2526 /* Handle non-constant sampler array indexing */
2527 src_reg sampler_reg;
2528 if (nonconst_sampler_index) {
2529 /* The highest sampler which may be used by this operation is
2530 * the last element of the array. Mark it here, because the generator
2531 * doesn't have enough information to determine the bound.
2532 */
2533 uint32_t array_size = ir->sampler->as_dereference_array()
2534 ->array->type->array_size();
2535
2536 uint32_t max_used = sampler + array_size - 1;
2537 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2538 max_used += prog_data->base.binding_table.gather_texture_start;
2539 } else {
2540 max_used += prog_data->base.binding_table.texture_start;
2541 }
2542
2543 brw_mark_surface_used(&prog_data->base, max_used);
2544
2545 /* Emit code to evaluate the actual indexing expression */
2546 nonconst_sampler_index->accept(this);
2547 src_reg temp(this, glsl_type::uint_type);
2548 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2549 sampler_reg = emit_uniformize(temp);
2550 } else {
2551 /* Single sampler, or constant array index; the indexing expression
2552 * is just an immediate.
2553 */
2554 sampler_reg = src_reg(sampler);
2555 }
2556
2557 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2558 * emitting anything other than setting up the constant result.
2559 */
2560 if (ir->op == ir_tg4) {
2561 ir_constant *chan = ir->lod_info.component->as_constant();
2562 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2563 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2564 dst_reg result(this, ir->type);
2565 this->result = src_reg(result);
2566 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2567 return;
2568 }
2569 }
2570
2571 /* Should be lowered by do_lower_texture_projection */
2572 assert(!ir->projector);
2573
2574 /* Should be lowered */
2575 assert(!ir->offset || !ir->offset->type->is_array());
2576
2577 /* Generate code to compute all the subexpression trees. This has to be
2578 * done before loading any values into MRFs for the sampler message since
2579 * generating these values may involve SEND messages that need the MRFs.
2580 */
2581 src_reg coordinate;
2582 if (ir->coordinate) {
2583 ir->coordinate->accept(this);
2584 coordinate = this->result;
2585 }
2586
2587 src_reg shadow_comparitor;
2588 if (ir->shadow_comparitor) {
2589 ir->shadow_comparitor->accept(this);
2590 shadow_comparitor = this->result;
2591 }
2592
2593 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2594 src_reg offset_value;
2595 if (has_nonconstant_offset) {
2596 ir->offset->accept(this);
2597 offset_value = src_reg(this->result);
2598 }
2599
2600 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2601 src_reg lod, dPdx, dPdy, sample_index, mcs;
2602 switch (ir->op) {
2603 case ir_tex:
2604 lod = src_reg(0.0f);
2605 lod_type = glsl_type::float_type;
2606 break;
2607 case ir_txf:
2608 case ir_txl:
2609 case ir_txs:
2610 ir->lod_info.lod->accept(this);
2611 lod = this->result;
2612 lod_type = ir->lod_info.lod->type;
2613 break;
2614 case ir_query_levels:
2615 lod = src_reg(0);
2616 lod_type = glsl_type::int_type;
2617 break;
2618 case ir_txf_ms:
2619 ir->lod_info.sample_index->accept(this);
2620 sample_index = this->result;
2621 sample_index_type = ir->lod_info.sample_index->type;
2622
2623 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2624 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2625 else
2626 mcs = src_reg(0u);
2627 break;
2628 case ir_txd:
2629 ir->lod_info.grad.dPdx->accept(this);
2630 dPdx = this->result;
2631
2632 ir->lod_info.grad.dPdy->accept(this);
2633 dPdy = this->result;
2634
2635 lod_type = ir->lod_info.grad.dPdx->type;
2636 break;
2637 case ir_txb:
2638 case ir_lod:
2639 case ir_tg4:
2640 break;
2641 }
2642
2643 enum opcode opcode;
2644 switch (ir->op) {
2645 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2646 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2647 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2648 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2649 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2650 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2651 case ir_tg4: opcode = has_nonconstant_offset
2652 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2653 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2654 case ir_txb:
2655 unreachable("TXB is not valid for vertex shaders.");
2656 case ir_lod:
2657 unreachable("LOD is not valid for vertex shaders.");
2658 default:
2659 unreachable("Unrecognized tex op");
2660 }
2661
2662 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2663 opcode, dst_reg(this, ir->type));
2664
2665 if (ir->offset != NULL && !has_nonconstant_offset) {
2666 inst->offset =
2667 brw_texture_offset(ir->offset->as_constant()->value.i,
2668 ir->offset->type->vector_elements);
2669 }
2670
2671 /* Stuff the channel select bits in the top of the texture offset */
2672 if (ir->op == ir_tg4)
2673 inst->offset |= gather_channel(ir, sampler) << 16;
2674
2675 /* The message header is necessary for:
2676 * - Gen4 (always)
2677 * - Gen9+ for selecting SIMD4x2
2678 * - Texel offsets
2679 * - Gather channel selection
2680 * - Sampler indices too large to fit in a 4-bit value.
2681 */
2682 inst->header_size =
2683 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2684 inst->offset != 0 || ir->op == ir_tg4 ||
2685 is_high_sampler(devinfo, sampler_reg)) ? 1 : 0;
2686 inst->base_mrf = 2;
2687 inst->mlen = inst->header_size + 1; /* always at least one */
2688 inst->dst.writemask = WRITEMASK_XYZW;
2689 inst->shadow_compare = ir->shadow_comparitor != NULL;
2690
2691 inst->src[1] = sampler_reg;
2692
2693 /* MRF for the first parameter */
2694 int param_base = inst->base_mrf + inst->header_size;
2695
2696 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2697 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2698 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2699 } else {
2700 /* Load the coordinate */
2701 /* FINISHME: gl_clamp_mask and saturate */
2702 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2703 int zero_mask = 0xf & ~coord_mask;
2704
2705 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2706 coordinate));
2707
2708 if (zero_mask != 0) {
2709 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2710 src_reg(0)));
2711 }
2712 /* Load the shadow comparitor */
2713 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2714 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2715 WRITEMASK_X),
2716 shadow_comparitor));
2717 inst->mlen++;
2718 }
2719
2720 /* Load the LOD info */
2721 if (ir->op == ir_tex || ir->op == ir_txl) {
2722 int mrf, writemask;
2723 if (devinfo->gen >= 5) {
2724 mrf = param_base + 1;
2725 if (ir->shadow_comparitor) {
2726 writemask = WRITEMASK_Y;
2727 /* mlen already incremented */
2728 } else {
2729 writemask = WRITEMASK_X;
2730 inst->mlen++;
2731 }
2732 } else /* devinfo->gen == 4 */ {
2733 mrf = param_base;
2734 writemask = WRITEMASK_W;
2735 }
2736 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2737 } else if (ir->op == ir_txf) {
2738 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2739 } else if (ir->op == ir_txf_ms) {
2740 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2741 sample_index));
2742 if (devinfo->gen >= 7) {
2743 /* MCS data is in the first channel of `mcs`, but we need to get it into
2744 * the .y channel of the second vec4 of params, so replicate .x across
2745 * the whole vec4 and then mask off everything except .y
2746 */
2747 mcs.swizzle = BRW_SWIZZLE_XXXX;
2748 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2749 mcs));
2750 }
2751 inst->mlen++;
2752 } else if (ir->op == ir_txd) {
2753 const glsl_type *type = lod_type;
2754
2755 if (devinfo->gen >= 5) {
2756 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2757 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2758 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2759 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2760 inst->mlen++;
2761
2762 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2763 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2764 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2765 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2766 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2767 inst->mlen++;
2768
2769 if (ir->shadow_comparitor) {
2770 emit(MOV(dst_reg(MRF, param_base + 2,
2771 ir->shadow_comparitor->type, WRITEMASK_Z),
2772 shadow_comparitor));
2773 }
2774 }
2775 } else /* devinfo->gen == 4 */ {
2776 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2777 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2778 inst->mlen += 2;
2779 }
2780 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2781 if (ir->shadow_comparitor) {
2782 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2783 shadow_comparitor));
2784 }
2785
2786 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2787 offset_value));
2788 inst->mlen++;
2789 }
2790 }
2791
2792 emit(inst);
2793
2794    /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2795     * faces * layers, but the spec requires layers.
2796 */
2797 if (ir->op == ir_txs) {
2798 glsl_type const *type = ir->sampler->type;
2799 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2800 type->sampler_array) {
2801 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2802 writemask(inst->dst, WRITEMASK_Z),
2803 src_reg(inst->dst), src_reg(6));
2804 }
2805 }
2806
2807 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2808 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2809 }
2810
2811 swizzle_result(ir, src_reg(inst->dst), sampler);
2812 }
2813
2814 /**
2815 * Apply workarounds for Gen6 gather with UINT/SINT
2816 */
2817 void
2818 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2819 {
2820 if (!wa)
2821 return;
2822
2823 int width = (wa & WA_8BIT) ? 8 : 16;
2824 dst_reg dst_f = dst;
2825 dst_f.type = BRW_REGISTER_TYPE_F;
2826
2827 /* Convert from UNORM to UINT */
2828 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2829 emit(MOV(dst, src_reg(dst_f)));
2830
2831 if (wa & WA_SIGN) {
2832 /* Reinterpret the UINT value as a signed INT value by
2833 * shifting the sign bit into place, then shifting back
2834 * preserving sign.
2835 */
2836 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2837 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2838 }
2839 }
2840
2841 /**
2842 * Set up the gather channel based on the swizzle, for gather4.
2843 */
2844 uint32_t
2845 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2846 {
2847 ir_constant *chan = ir->lod_info.component->as_constant();
2848 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2849 switch (swiz) {
2850 case SWIZZLE_X: return 0;
2851 case SWIZZLE_Y:
2852 /* gather4 sampler is broken for green channel on RG32F --
2853 * we must ask for blue instead.
2854 */
2855 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2856 return 2;
2857 return 1;
2858 case SWIZZLE_Z: return 2;
2859 case SWIZZLE_W: return 3;
2860 default:
2861 unreachable("Not reached"); /* zero, one swizzles handled already */
2862 }
2863 }
2864
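/**
 * Apply the GL texture swizzle (including ZERO/ONE components) to the raw
 * sampler result and write the swizzled value into this->result.
 */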
2865 void
2866 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2867 {
2868 int s = key->tex.swizzles[sampler];
2869
2870 this->result = src_reg(this, ir->type);
2871 dst_reg swizzled_result(this->result);
2872
2873 if (ir->op == ir_query_levels) {
2874 /* # levels is in .w */
2875 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2876 emit(MOV(swizzled_result, orig_val));
2877 return;
2878 }
2879
2880 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2881 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2882 emit(MOV(swizzled_result, orig_val));
2883 return;
2884 }
2885
2886
2887 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2888 int swizzle[4] = {0};
2889
2890 for (int i = 0; i < 4; i++) {
2891 switch (GET_SWZ(s, i)) {
2892 case SWIZZLE_ZERO:
2893 zero_mask |= (1 << i);
2894 break;
2895 case SWIZZLE_ONE:
2896 one_mask |= (1 << i);
2897 break;
2898 default:
2899 copy_mask |= (1 << i);
2900 swizzle[i] = GET_SWZ(s, i);
2901 break;
2902 }
2903 }
2904
2905 if (copy_mask) {
2906 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2907 swizzled_result.writemask = copy_mask;
2908 emit(MOV(swizzled_result, orig_val));
2909 }
2910
2911 if (zero_mask) {
2912 swizzled_result.writemask = zero_mask;
2913 emit(MOV(swizzled_result, src_reg(0.0f)));
2914 }
2915
2916 if (one_mask) {
2917 swizzled_result.writemask = one_mask;
2918 emit(MOV(swizzled_result, src_reg(1.0f)));
2919 }
2920 }
2921
2922 void
2923 vec4_visitor::visit(ir_return *)
2924 {
2925 unreachable("not reached");
2926 }
2927
2928 void
2929 vec4_visitor::visit(ir_discard *)
2930 {
2931 unreachable("not reached");
2932 }
2933
2934 void
2935 vec4_visitor::visit(ir_if *ir)
2936 {
2937 /* Don't point the annotation at the if statement, because then it plus
2938 * the then and else blocks get printed.
2939 */
2940 this->base_ir = ir->condition;
2941
2942 if (devinfo->gen == 6) {
2943 emit_if_gen6(ir);
2944 } else {
2945 enum brw_predicate predicate;
2946 emit_bool_to_cond_code(ir->condition, &predicate);
2947 emit(IF(predicate));
2948 }
2949
2950 visit_instructions(&ir->then_instructions);
2951
2952 if (!ir->else_instructions.is_empty()) {
2953 this->base_ir = ir->condition;
2954 emit(BRW_OPCODE_ELSE);
2955
2956 visit_instructions(&ir->else_instructions);
2957 }
2958
2959 this->base_ir = ir->condition;
2960 emit(BRW_OPCODE_ENDIF);
2961 }
2962
2963 void
2964 vec4_visitor::visit(ir_emit_vertex *)
2965 {
2966 unreachable("not reached");
2967 }
2968
2969 void
2970 vec4_visitor::visit(ir_end_primitive *)
2971 {
2972 unreachable("not reached");
2973 }
2974
2975 void
2976 vec4_visitor::visit(ir_barrier *)
2977 {
2978 unreachable("not reached");
2979 }
2980
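/**
 * Emit an untyped atomic message: the offset and up to two operands are
 * loaded into the X channel of consecutive message registers, followed by
 * the SHADER_OPCODE_UNTYPED_ATOMIC send.
 */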
2981 void
2982 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2983 dst_reg dst, src_reg offset,
2984 src_reg src0, src_reg src1)
2985 {
2986 unsigned mlen = 0;
2987
2988 /* Set the atomic operation offset. */
2989 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2990 mlen++;
2991
2992 /* Set the atomic operation arguments. */
2993 if (src0.file != BAD_FILE) {
2994 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2995 mlen++;
2996 }
2997
2998 if (src1.file != BAD_FILE) {
2999 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3000 mlen++;
3001 }
3002
3003 /* Emit the instruction. Note that this maps to the normal SIMD8
3004 * untyped atomic message on Ivy Bridge, but that's OK because
3005 * unused channels will be masked out.
3006 */
3007 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3008 brw_message_reg(0),
3009 src_reg(surf_index), src_reg(atomic_op));
3010 inst->mlen = mlen;
3011 }
3012
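/**
 * Emit an untyped surface read: load the offset into the X channel of the
 * message register and send SHADER_OPCODE_UNTYPED_SURFACE_READ.
 */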
3013 void
3014 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3015 src_reg offset)
3016 {
3017 /* Set the surface read offset. */
3018 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3019
3020 /* Emit the instruction. Note that this maps to the normal SIMD8
3021 * untyped surface read message, but that's OK because unused
3022 * channels will be masked out.
3023 */
3024 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3025 brw_message_reg(0),
3026 src_reg(surf_index), src_reg(1));
3027 inst->mlen = 1;
3028 }
3029
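/**
 * Compute the NDC output (x/w, y/w, z/w, 1/w) from gl_Position, which the
 * pre-Gen6 vertex output path below requires.
 */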
3030 void
3031 vec4_visitor::emit_ndc_computation()
3032 {
3033 /* Get the position */
3034 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3035
3036 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3037 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3038 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3039
3040 current_annotation = "NDC";
3041 dst_reg ndc_w = ndc;
3042 ndc_w.writemask = WRITEMASK_W;
3043 src_reg pos_w = pos;
3044 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3045 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3046
3047 dst_reg ndc_xyz = ndc;
3048 ndc_xyz.writemask = WRITEMASK_XYZ;
3049
3050 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3051 }
3052
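/**
 * Fill the PSIZ slot of the VUE header.  On pre-Gen6 this packs the point
 * size and user clip flags (plus the negative-RHW workaround on hardware
 * that needs it) into a single header word; on Gen6+ it writes point size,
 * layer and viewport index into the .w, .y and .z channels when those
 * outputs are present.
 */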
3053 void
3054 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3055 {
3056 if (devinfo->gen < 6 &&
3057 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3058 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3059 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3060 dst_reg header1_w = header1;
3061 header1_w.writemask = WRITEMASK_W;
3062
3063 emit(MOV(header1, 0u));
3064
3065 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3066 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3067
3068 current_annotation = "Point size";
3069 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3070 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3071 }
3072
3073 if (key->userclip_active) {
3074 current_annotation = "Clipping flags";
3075 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3076 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3077
3078 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3079 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3080 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3081
3082 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3083 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3084 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3085 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3086 }
3087
3088 /* i965 clipping workaround:
3089        * 1) Test for negative RHW
3090 * 2) If set,
3091 * set ndc = (0,0,0,0)
3092 * set ucp[6] = 1
3093 *
3094 * Later, clipping will detect ucp[6] and ensure the primitive is
3095 * clipped against all fixed planes.
3096 */
3097 if (devinfo->has_negative_rhw_bug) {
3098 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3099 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3100 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3101 vec4_instruction *inst;
3102 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3103 inst->predicate = BRW_PREDICATE_NORMAL;
3104 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3105 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3106 inst->predicate = BRW_PREDICATE_NORMAL;
3107 }
3108
3109 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3110 } else if (devinfo->gen < 6) {
3111 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3112 } else {
3113 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3114 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3115 dst_reg reg_w = reg;
3116 reg_w.writemask = WRITEMASK_W;
3117 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3118 reg_as_src.type = reg_w.type;
3119 reg_as_src.swizzle = brw_swizzle_for_size(1);
3120 emit(MOV(reg_w, reg_as_src));
3121 }
3122 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3123 dst_reg reg_y = reg;
3124 reg_y.writemask = WRITEMASK_Y;
3125 reg_y.type = BRW_REGISTER_TYPE_D;
3126 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3127 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3128 }
3129 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3130 dst_reg reg_z = reg;
3131 reg_z.writemask = WRITEMASK_Z;
3132 reg_z.type = BRW_REGISTER_TYPE_D;
3133 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3134 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3135 }
3136 }
3137 }
3138
3139 void
3140 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3141 {
3142 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3143 *
3144 * "If a linked set of shaders forming the vertex stage contains no
3145 * static write to gl_ClipVertex or gl_ClipDistance, but the
3146 * application has requested clipping against user clip planes through
3147 * the API, then the coordinate written to gl_Position is used for
3148 * comparison against the user clip planes."
3149 *
3150 * This function is only called if the shader didn't write to
3151 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3152 * if the user wrote to it; otherwise we use gl_Position.
3153 */
3154 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3155 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3156 clip_vertex = VARYING_SLOT_POS;
3157 }
3158
3159 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3160 ++i) {
3161 reg.writemask = 1 << i;
3162 emit(DP4(reg,
3163 src_reg(output_reg[clip_vertex]),
3164 src_reg(this->userplane[i + offset])));
3165 }
3166 }
3167
3168 vec4_instruction *
3169 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3170 {
3171 assert(varying < VARYING_SLOT_MAX);
3172 assert(output_reg[varying].type == reg.type);
3173 current_annotation = output_reg_annotation[varying];
3174 /* Copy the register, saturating if necessary */
3175 return emit(MOV(reg, src_reg(output_reg[varying])));
3176 }
3177
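/**
 * Emit the MOV(s) that fill one VUE slot (one MRF) of the URB write,
 * handling the special built-in slots (PSIZ, NDC, position, edge flag,
 * clamped colors) and falling back to a plain copy for generic varyings.
 */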
3178 void
3179 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3180 {
3181 reg.type = BRW_REGISTER_TYPE_F;
3182 output_reg[varying].type = reg.type;
3183
3184 switch (varying) {
3185 case VARYING_SLOT_PSIZ:
3186 {
3187 /* PSIZ is always in slot 0, and is coupled with other flags. */
3188 current_annotation = "indices, point width, clip flags";
3189 emit_psiz_and_flags(reg);
3190 break;
3191 }
3192 case BRW_VARYING_SLOT_NDC:
3193 current_annotation = "NDC";
3194 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3195 break;
3196 case VARYING_SLOT_POS:
3197 current_annotation = "gl_Position";
3198 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3199 break;
3200 case VARYING_SLOT_EDGE:
3201 /* This is present when doing unfilled polygons. We're supposed to copy
3202 * the edge flag from the user-provided vertex array
3203 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3204 * of that attribute (starts as 1.0f). This is then used in clipping to
3205 * determine which edges should be drawn as wireframe.
3206 */
3207 current_annotation = "edge flag";
3208 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3209 glsl_type::float_type, WRITEMASK_XYZW))));
3210 break;
3211 case BRW_VARYING_SLOT_PAD:
3212 /* No need to write to this slot */
3213 break;
3214 case VARYING_SLOT_COL0:
3215 case VARYING_SLOT_COL1:
3216 case VARYING_SLOT_BFC0:
3217 case VARYING_SLOT_BFC1: {
3218 /* These built-in varyings are only supported in compatibility mode,
3219 * and we only support GS in core profile. So, this must be a vertex
3220 * shader.
3221 */
3222 assert(stage == MESA_SHADER_VERTEX);
3223 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3224 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3225 inst->saturate = true;
3226 break;
3227 }
3228
3229 default:
3230 emit_generic_urb_slot(reg, varying);
3231 break;
3232 }
3233 }
3234
3235 static int
3236 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3237 {
3238 if (devinfo->gen >= 6) {
3239 /* URB data written (does not include the message header reg) must
3240 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3241 * section 5.4.3.2.2: URB_INTERLEAVED.
3242 *
3243 * URB entries are allocated on a multiple of 1024 bits, so an
3244 * extra 128 bits written here to make the end align to 256 is
3245 * no problem.
3246 */
3247 if ((mlen % 2) != 1)
3248 mlen++;
3249 }
3250
3251 return mlen;
3252 }
3253
3254
3255 /**
3256 * Generates the VUE payload plus the necessary URB write instructions to
3257 * output it.
3258 *
3259 * The VUE layout is documented in Volume 2a.
3260 */
3261 void
3262 vec4_visitor::emit_vertex()
3263 {
3264 /* MRF 0 is reserved for the debugger, so start with message header
3265 * in MRF 1.
3266 */
3267 int base_mrf = 1;
3268 int mrf = base_mrf;
3269 /* In the process of generating our URB write message contents, we
3270 * may need to unspill a register or load from an array. Those
3271 * reads would use MRFs 14-15.
3272 */
3273 int max_usable_mrf = 13;
3274
3275 /* The following assertion verifies that max_usable_mrf causes an
3276 * even-numbered amount of URB write data, which will meet gen6's
3277 * requirements for length alignment.
3278 */
3279 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3280
3281 /* First mrf is the g0-based message header containing URB handles and
3282 * such.
3283 */
3284 emit_urb_write_header(mrf++);
3285
3286 if (devinfo->gen < 6) {
3287 emit_ndc_computation();
3288 }
3289
3290    /* Lower legacy fixed-function and ClipVertex clipping to clip distances */
3291 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3292 current_annotation = "user clip distances";
3293
3294 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3295 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3296
3297 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3298 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3299 }
3300
3301 /* We may need to split this up into several URB writes, so do them in a
3302 * loop.
3303 */
3304 int slot = 0;
3305 bool complete = false;
3306 do {
3307 /* URB offset is in URB row increments, and each of our MRFs is half of
3308 * one of those, since we're doing interleaved writes.
3309 */
3310 int offset = slot / 2;
3311
3312 mrf = base_mrf + 1;
3313 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3314 emit_urb_slot(dst_reg(MRF, mrf++),
3315 prog_data->vue_map.slot_to_varying[slot]);
3316
3317 /* If this was max_usable_mrf, we can't fit anything more into this
3318 * URB WRITE.
3319 */
3320 if (mrf > max_usable_mrf) {
3321 slot++;
3322 break;
3323 }
3324 }
3325
3326 complete = slot >= prog_data->vue_map.num_slots;
3327 current_annotation = "URB write";
3328 vec4_instruction *inst = emit_urb_write_opcode(complete);
3329 inst->base_mrf = base_mrf;
3330 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3331 inst->offset += offset;
3332 } while(!complete);
3333 }
3334
3335
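/**
 * Compute the offset source for a scratch read/write: either an immediate
 * or, for relative addressing, a register holding reladdr + reg_offset,
 * scaled to the units the scratch message expects.
 */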
3336 src_reg
3337 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3338 src_reg *reladdr, int reg_offset)
3339 {
3340 /* Because we store the values to scratch interleaved like our
3341 * vertex data, we need to scale the vec4 index by 2.
3342 */
3343 int message_header_scale = 2;
3344
3345 /* Pre-gen6, the message header uses byte offsets instead of vec4
3346 * (16-byte) offset units.
3347 */
3348 if (devinfo->gen < 6)
3349 message_header_scale *= 16;
3350
3351 if (reladdr) {
3352 src_reg index = src_reg(this, glsl_type::int_type);
3353
3354 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3355 src_reg(reg_offset)));
3356 emit_before(block, inst, MUL(dst_reg(index), index,
3357 src_reg(message_header_scale)));
3358
3359 return index;
3360 } else {
3361 return src_reg(reg_offset * message_header_scale);
3362 }
3363 }
3364
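/**
 * Compute the offset source for a pull constant load, analogous to
 * get_scratch_offset() but without the interleaving scale, and using a GRF
 * rather than an immediate on Gen8+ so we can send from GRF.
 */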
3365 src_reg
3366 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3367 src_reg *reladdr, int reg_offset)
3368 {
3369 if (reladdr) {
3370 src_reg index = src_reg(this, glsl_type::int_type);
3371
3372 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3373 src_reg(reg_offset)));
3374
3375 /* Pre-gen6, the message header uses byte offsets instead of vec4
3376 * (16-byte) offset units.
3377 */
3378 if (devinfo->gen < 6) {
3379 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3380 }
3381
3382 return index;
3383 } else if (devinfo->gen >= 8) {
3384 /* Store the offset in a GRF so we can send-from-GRF. */
3385 src_reg offset = src_reg(this, glsl_type::int_type);
3386 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3387 return offset;
3388 } else {
3389 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3390 return src_reg(reg_offset * message_header_scale);
3391 }
3392 }
3393
3394 /**
3395 * Emits an instruction before @inst to load the value named by @orig_src
3396 * from scratch space at @base_offset to @temp.
3397 *
3398 * @base_offset is measured in 32-byte units (the size of a register).
3399 */
3400 void
3401 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3402 dst_reg temp, src_reg orig_src,
3403 int base_offset)
3404 {
3405 int reg_offset = base_offset + orig_src.reg_offset;
3406 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3407 reg_offset);
3408
3409 emit_before(block, inst, SCRATCH_READ(temp, index));
3410 }
3411
3412 /**
3413 * Emits an instruction after @inst to store the value to be written
3414 * to @orig_dst to scratch space at @base_offset, from @temp.
3415 *
3416 * @base_offset is measured in 32-byte units (the size of a register).
3417 */
3418 void
3419 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3420 int base_offset)
3421 {
3422 int reg_offset = base_offset + inst->dst.reg_offset;
3423 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3424 reg_offset);
3425
3426 /* Create a temporary register to store *inst's result in.
3427 *
3428 * We have to be careful in MOVing from our temporary result register in
3429 * the scratch write. If we swizzle from channels of the temporary that
3430 * weren't initialized, it will confuse live interval analysis, which will
3431 * make spilling fail to make progress.
3432 */
3433 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3434 inst->dst.type),
3435 brw_swizzle_for_mask(inst->dst.writemask));
3436 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3437 inst->dst.writemask));
3438 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3439 write->predicate = inst->predicate;
3440 write->ir = inst->ir;
3441 write->annotation = inst->annotation;
3442 inst->insert_after(block, write);
3443
3444 inst->dst.file = temp.file;
3445 inst->dst.reg = temp.reg;
3446 inst->dst.reg_offset = temp.reg_offset;
3447 inst->dst.reladdr = NULL;
3448 }
3449
3450 /**
3451 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3452 * adds the scratch read(s) before \p inst. The function also checks for
3453 * recursive reladdr scratch accesses, issuing the corresponding scratch
3454 * loads and rewriting reladdr references accordingly.
3455 *
3456 * \return \p src if it did not require a scratch load, otherwise, the
3457 * register holding the result of the scratch load that the caller should
3458 * use to rewrite src.
3459 */
3460 src_reg
3461 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3462 vec4_instruction *inst, src_reg src)
3463 {
3464 /* Resolve recursive reladdr scratch access by calling ourselves
3465 * with src.reladdr
3466 */
3467 if (src.reladdr)
3468 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3469 *src.reladdr);
3470
3471 /* Now handle scratch access on src */
3472 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3473 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3474 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3475 src.reg = temp.reg;
3476 src.reg_offset = temp.reg_offset;
3477 src.reladdr = NULL;
3478 }
3479
3480 return src;
3481 }
3482
3483 /**
3484 * We can't generally support array access in GRF space, because a
3485 * single instruction's destination can only span 2 contiguous
3486 * registers. So, we send all GRF arrays that get variable index
3487 * access to scratch space.
3488 */
3489 void
3490 vec4_visitor::move_grf_array_access_to_scratch()
3491 {
3492 int scratch_loc[this->alloc.count];
3493 memset(scratch_loc, -1, sizeof(scratch_loc));
3494
3495 /* First, calculate the set of virtual GRFs that need to be punted
3496     * to scratch due to having any array access on them, and record where
3497     * in scratch each one goes.
3498 */
3499 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3500 if (inst->dst.file == GRF && inst->dst.reladdr) {
3501 if (scratch_loc[inst->dst.reg] == -1) {
3502 scratch_loc[inst->dst.reg] = last_scratch;
3503 last_scratch += this->alloc.sizes[inst->dst.reg];
3504 }
3505
3506 for (src_reg *iter = inst->dst.reladdr;
3507 iter->reladdr;
3508 iter = iter->reladdr) {
3509 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3510 scratch_loc[iter->reg] = last_scratch;
3511 last_scratch += this->alloc.sizes[iter->reg];
3512 }
3513 }
3514 }
3515
3516 for (int i = 0 ; i < 3; i++) {
3517 for (src_reg *iter = &inst->src[i];
3518 iter->reladdr;
3519 iter = iter->reladdr) {
3520 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3521 scratch_loc[iter->reg] = last_scratch;
3522 last_scratch += this->alloc.sizes[iter->reg];
3523 }
3524 }
3525 }
3526 }
3527
3528 /* Now, for anything that will be accessed through scratch, rewrite
3529 * it to load/store. Note that this is a _safe list walk, because
3530 * we may generate a new scratch_write instruction after the one
3531 * we're processing.
3532 */
3533 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3534 /* Set up the annotation tracking for new generated instructions. */
3535 base_ir = inst->ir;
3536 current_annotation = inst->annotation;
3537
3538 /* First handle scratch access on the dst. Notice we have to handle
3539 * the case where the dst's reladdr also points to scratch space.
3540 */
3541 if (inst->dst.reladdr)
3542 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3543 *inst->dst.reladdr);
3544
3545 /* Now that we have handled any (possibly recursive) reladdr scratch
3546 * accesses for dst we can safely do the scratch write for dst itself
3547 */
3548 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3549 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3550
3551 /* Now handle scratch access on any src. In this case, since inst->src[i]
3552 * already is a src_reg, we can just call emit_resolve_reladdr with
3553 * inst->src[i] and it will take care of handling scratch loads for
3554 * both src and src.reladdr (recursively).
3555 */
3556 for (int i = 0 ; i < 3; i++) {
3557 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3558 inst->src[i]);
3559 }
3560 }
3561 }
3562
3563 /**
3564 * Emits an instruction before @inst to load the value named by @orig_src
3565 * from the pull constant buffer (surface) at @base_offset to @temp.
3566 */
3567 void
3568 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3569 dst_reg temp, src_reg orig_src,
3570 int base_offset)
3571 {
3572 int reg_offset = base_offset + orig_src.reg_offset;
3573 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3574 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3575 reg_offset);
3576
3577 emit_pull_constant_load_reg(temp,
3578 index,
3579 offset,
3580 block, inst);
3581 }
3582
3583 /**
3584 * Implements array access of uniforms by inserting a
3585 * PULL_CONSTANT_LOAD instruction.
3586 *
3587 * Unlike temporary GRF array access (where we don't support it due to
3588 * the difficulty of doing relative addressing on instruction
3589 * destinations), we could potentially do array access of uniforms
3590 * that were loaded in GRF space as push constants. In real-world
3591 * usage we've seen, though, the arrays being used are always larger
3592 * than we could load as push constants, so just always move all
3593 * uniform array access out to a pull constant buffer.
3594 */
3595 void
3596 vec4_visitor::move_uniform_array_access_to_pull_constants()
3597 {
3598 int pull_constant_loc[this->uniforms];
3599 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3600 bool nested_reladdr;
3601
3602 /* Walk through and find array access of uniforms. Put a copy of that
3603 * uniform in the pull constant buffer.
3604 *
3605 * Note that we don't move constant-indexed accesses to arrays. No
3606 * testing has been done of the performance impact of this choice.
3607 */
3608 do {
3609 nested_reladdr = false;
3610
3611 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3612 for (int i = 0 ; i < 3; i++) {
3613 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3614 continue;
3615
3616 int uniform = inst->src[i].reg;
3617
3618 if (inst->src[i].reladdr->reladdr)
3619 nested_reladdr = true; /* will need another pass */
3620
3621 /* If this array isn't already present in the pull constant buffer,
3622 * add it.
3623 */
3624 if (pull_constant_loc[uniform] == -1) {
3625 const gl_constant_value **values =
3626 &stage_prog_data->param[uniform * 4];
3627
3628 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3629
3630 assert(uniform < uniform_array_size);
3631 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3632 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3633 = values[j];
3634 }
3635 }
3636
3637 /* Set up the annotation tracking for new generated instructions. */
3638 base_ir = inst->ir;
3639 current_annotation = inst->annotation;
3640
3641 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3642
3643 emit_pull_constant_load(block, inst, temp, inst->src[i],
3644 pull_constant_loc[uniform]);
3645
3646 inst->src[i].file = temp.file;
3647 inst->src[i].reg = temp.reg;
3648 inst->src[i].reg_offset = temp.reg_offset;
3649 inst->src[i].reladdr = NULL;
3650 }
3651 }
3652 } while (nested_reladdr);
3653
3654 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3655 * no need to track them as larger-than-vec4 objects. This will be
3656 * relied on in cutting out unused uniform vectors from push
3657 * constants.
3658 */
3659 split_uniform_registers();
3660 }
3661
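/**
 * Resolve a source negate on an unsigned doubleword (UD) register.
 *
 * The negated value is materialized by copying the source (negate modifier
 * and all) through a MOV into a fresh uvec4 temporary; the caller's source
 * is then replaced with that temporary, which carries no negate modifier
 * (presumably because the hardware's source-negate modifier cannot be
 * relied on for UD operands in the instructions that call this).
 */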
3662 void
3663 vec4_visitor::resolve_ud_negate(src_reg *reg)
3664 {
3665 if (reg->type != BRW_REGISTER_TYPE_UD ||
3666 !reg->negate)
3667 return;
3668
3669 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3670 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3671 *reg = temp;
3672 }
3673
3674 /**
3675 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3676 *
3677 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3678 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3679 */
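/* Worked example (hypothetical register contents): if the CMP wrote
 * 0x8000ac01, only bit 0 is meaningful.  AND with 1 yields 0x00000001, and
 * the MOV of the negated result writes 0xffffffff (~0, i.e. true); a CMP
 * result with bit 0 clear ends up as 0x00000000 (false).
 */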
3680 void
3681 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3682 {
3683 assert(devinfo->gen <= 5);
3684
3685 if (!rvalue->type->is_boolean())
3686 return;
3687
3688 src_reg and_result = src_reg(this, rvalue->type);
3689 src_reg neg_result = src_reg(this, rvalue->type);
3690 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3691 emit(MOV(dst_reg(neg_result), negate(and_result)));
3692 *reg = neg_result;
3693 }
3694
3695 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3696 void *log_data,
3697 struct gl_program *prog,
3698 const struct brw_vue_prog_key *key,
3699 struct brw_vue_prog_data *prog_data,
3700 struct gl_shader_program *shader_prog,
3701 gl_shader_stage stage,
3702 void *mem_ctx,
3703 bool no_spills,
3704 int shader_time_index)
3705 : backend_shader(compiler, log_data, mem_ctx,
3706 shader_prog, prog, &prog_data->base, stage),
3707 key(key),
3708 prog_data(prog_data),
3709 sanity_param_count(0),
3710 fail_msg(NULL),
3711 first_non_payload_grf(0),
3712 need_all_constants_in_pull_buffer(false),
3713 no_spills(no_spills),
3714 shader_time_index(shader_time_index),
3715 last_scratch(0)
3716 {
3717 this->failed = false;
3718
3719 this->base_ir = NULL;
3720 this->current_annotation = NULL;
3721 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3722
3723 this->variable_ht = hash_table_ctor(0,
3724 hash_table_pointer_hash,
3725 hash_table_pointer_compare);
3726
3727 this->virtual_grf_start = NULL;
3728 this->virtual_grf_end = NULL;
3729 this->live_intervals = NULL;
3730
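   /* On Gen7+ there are no MRF registers; message payloads are assembled in
    * the top of the GRF file instead, so cap the allocatable GRFs below that
    * region (GEN7_MRF_HACK_START).  Earlier gens can use the full GRF file.
    */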
3731 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3732
3733 this->uniforms = 0;
3734
3735    /* Initialize uniform_array_size to at least 1 because the pre-gen6 VS requires
3736     * at least one uniform slot. See setup_uniforms() in brw_vec4.cpp.
3737 */
3738 this->uniform_array_size = 1;
3739 if (prog_data) {
3740 this->uniform_array_size =
3741 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3742 }
3743
3744 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3745 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3746 }
3747
3748 vec4_visitor::~vec4_visitor()
3749 {
3750 hash_table_dtor(this->variable_ht);
3751 }
3752
3753
3754 void
3755 vec4_visitor::fail(const char *format, ...)
3756 {
3757 va_list va;
3758 char *msg;
3759
3760 if (failed)
3761 return;
3762
3763 failed = true;
3764
3765 va_start(va, format);
3766 msg = ralloc_vasprintf(mem_ctx, format, va);
3767 va_end(va);
3768 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3769
3770 this->fail_msg = msg;
3771
3772 if (debug_enabled) {
3773 fprintf(stderr, "%s", msg);
3774 }
3775 }
3776
3777 } /* namespace brw */