mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (devinfo->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 vec4_instruction *
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 math = emit(MOV(dst, src_reg(math->dst)));
354 } else if (devinfo->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358
359 return math;
360 }
361
362 void
363 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
364 {
365 if (devinfo->gen < 7) {
366 unreachable("ir_unop_pack_half_2x16 should be lowered");
367 }
368
369 assert(dst.type == BRW_REGISTER_TYPE_UD);
370 assert(src0.type == BRW_REGISTER_TYPE_F);
371
372 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
373 *
374 * Because this instruction does not have a 16-bit floating-point type,
375 * the destination data type must be Word (W).
376 *
377 * The destination must be DWord-aligned and specify a horizontal stride
378 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
379 * each destination channel and the upper word is not modified.
380 *
381 * The above restriction implies that the f32to16 instruction must use
382 * align1 mode, because only in align1 mode is it possible to specify
383 * horizontal stride. We choose here to defy the hardware docs and emit
384 * align16 instructions.
385 *
386 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
387 * instructions. I was partially successful in that the code passed all
388 * tests. However, the code was dubiously correct and fragile, and the
389 * tests were not harsh enough to probe that frailty. Not trusting the
390 * code, I chose instead to remain in align16 mode in defiance of the hw
391 * docs).
392 *
393 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
394 * simulator, emitting a f32to16 in align16 mode with UD as destination
395 * data type is safe. The behavior differs from that specified in the PRM
396 * in that the upper word of each destination channel is cleared to 0.
397 */
398
399 dst_reg tmp_dst(this, glsl_type::uvec2_type);
400 src_reg tmp_src(tmp_dst);
401
402 #if 0
403 /* Verify the undocumented behavior on which the following instructions
404 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
405 * then the result of the bit-or instruction below will be incorrect.
406 *
407 * You should inspect the disasm output in order to verify that the MOV is
408 * not optimized away.
409 */
410 emit(MOV(tmp_dst, src_reg(0x12345678u)));
411 #endif
412
413 /* Give tmp the form below, where "." means untouched.
414 *
415 * w z y x w z y x
416 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
417 *
418 * That the upper word of each write-channel be 0 is required for the
419 * following bit-shift and bit-or instructions to work. Note that this
420 * relies on the undocumented hardware behavior mentioned above.
421 */
422 tmp_dst.writemask = WRITEMASK_XY;
423 emit(F32TO16(tmp_dst, src0));
424
425 /* Give the write-channels of dst the form:
426 * 0xhhhh0000
427 */
428 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
429 emit(SHL(dst, tmp_src, src_reg(16u)));
430
431 /* Finally, give the write-channels of dst the form of packHalf2x16's
432 * output:
433 * 0xhhhhllll
434 */
435 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
436 emit(OR(dst, src_reg(dst), tmp_src));
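   /* Worked example (illustrative values): for src0 = vec2(1.0, 2.0) in .xy,
    * F32TO16 leaves tmp = <0x00003c00, 0x00004000, ., .>; the SHL of tmp.yyyy
    * by 16 writes 0x40000000 to every enabled channel of dst, and the OR with
    * tmp.xxxx yields 0x40003c00 -- the value packHalf2x16(vec2(1.0, 2.0))
    * returns, replicated across dst's writemask.
    */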
437 }
438
439 void
440 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
441 {
442 if (devinfo->gen < 7) {
443 unreachable("ir_unop_unpack_half_2x16 should be lowered");
444 }
445
446 assert(dst.type == BRW_REGISTER_TYPE_F);
447 assert(src0.type == BRW_REGISTER_TYPE_UD);
448
449 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
450 *
451 * Because this instruction does not have a 16-bit floating-point type,
452 * the source data type must be Word (W). The destination type must be
453 * F (Float).
454 *
455 * To use W as the source data type, we must adjust horizontal strides,
456 * which is only possible in align1 mode. All my [chadv] attempts at
457 * emitting align1 instructions for unpackHalf2x16 failed to pass the
458 * Piglit tests, so I gave up.
459 *
460 * I've verified that, on gen7 hardware and the simulator, it is safe to
461 * emit f16to32 in align16 mode with UD as source data type.
462 */
463
464 dst_reg tmp_dst(this, glsl_type::uvec2_type);
465 src_reg tmp_src(tmp_dst);
466
467 tmp_dst.writemask = WRITEMASK_X;
468 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
469
470 tmp_dst.writemask = WRITEMASK_Y;
471 emit(SHR(tmp_dst, src0, src_reg(16u)));
472
473 dst.writemask = WRITEMASK_XY;
474 emit(F16TO32(dst, tmp_src));
475 }
476
477 void
478 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
479 {
480 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
481 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
482 * is not suitable to generate the shift values, but we can use the packed
483 * vector float and a type-converting MOV.
484 */
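   /* 0x00, 0x60, 0x70 and 0x78 are the VF (restricted 8-bit float) encodings
    * of 0.0, 8.0, 16.0 and 24.0; the type-converting MOV into the UD register
    * below produces the integer shift counts <0, 8, 16, 24> mentioned above.
    */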
485 dst_reg shift(this, glsl_type::uvec4_type);
486 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
487
488 dst_reg shifted(this, glsl_type::uvec4_type);
489 src0.swizzle = BRW_SWIZZLE_XXXX;
490 emit(SHR(shifted, src0, src_reg(shift)));
491
492 shifted.type = BRW_REGISTER_TYPE_UB;
493 dst_reg f(this, glsl_type::vec4_type);
494 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
495
496 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
497 }
498
499 void
500 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
501 {
502 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
503 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
504 * is not suitable to generate the shift values, but we can use the packed
505 * vector float and a type-converting MOV.
506 */
507 dst_reg shift(this, glsl_type::uvec4_type);
508 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
509
510 dst_reg shifted(this, glsl_type::uvec4_type);
511 src0.swizzle = BRW_SWIZZLE_XXXX;
512 emit(SHR(shifted, src0, src_reg(shift)));
513
514 shifted.type = BRW_REGISTER_TYPE_B;
515 dst_reg f(this, glsl_type::vec4_type);
516 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
517
518 dst_reg scaled(this, glsl_type::vec4_type);
519 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
520
521 dst_reg max(this, glsl_type::vec4_type);
522 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
523 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
524 }
525
526 void
527 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
528 {
529 dst_reg saturated(this, glsl_type::vec4_type);
530 vec4_instruction *inst = emit(MOV(saturated, src0));
531 inst->saturate = true;
532
533 dst_reg scaled(this, glsl_type::vec4_type);
534 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
535
536 dst_reg rounded(this, glsl_type::vec4_type);
537 emit(RNDE(rounded, src_reg(scaled)));
538
539 dst_reg u(this, glsl_type::uvec4_type);
540 emit(MOV(u, src_reg(rounded)));
541
542 src_reg bytes(u);
543 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
544 }
545
546 void
547 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
548 {
549 dst_reg max(this, glsl_type::vec4_type);
550 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
551
552 dst_reg min(this, glsl_type::vec4_type);
553 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
554
555 dst_reg scaled(this, glsl_type::vec4_type);
556 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
557
558 dst_reg rounded(this, glsl_type::vec4_type);
559 emit(RNDE(rounded, src_reg(scaled)));
560
561 dst_reg i(this, glsl_type::ivec4_type);
562 emit(MOV(i, src_reg(rounded)));
563
564 src_reg bytes(i);
565 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
566 }
567
568 void
569 vec4_visitor::visit_instructions(const exec_list *list)
570 {
571 foreach_in_list(ir_instruction, ir, list) {
572 base_ir = ir;
573 ir->accept(this);
574 }
575 }
576
577 /**
578 * Returns the minimum number of vec4 elements needed to pack a type.
579 *
580 * For simple types, it will return 1 (a single vec4); for matrices, the
581 * number of columns; for arrays and structs, the sum of the vec4_size of
582 * each of their elements; and for samplers and atomics, zero.
583 *
584 * This method is useful to calculate how much register space is needed to
585 * store a particular type.
586 */
587 int
588 vec4_visitor::type_size(const struct glsl_type *type)
589 {
590 unsigned int i;
591 int size;
592
593 switch (type->base_type) {
594 case GLSL_TYPE_UINT:
595 case GLSL_TYPE_INT:
596 case GLSL_TYPE_FLOAT:
597 case GLSL_TYPE_BOOL:
598 if (type->is_matrix()) {
599 return type->matrix_columns;
600 } else {
601 /* Regardless of the size of the vector, it gets a full vec4. This is bad
602 * packing for things like floats, but otherwise arrays become a
603 * mess. Hopefully a later pass over the code can pack scalars
604 * down if appropriate.
605 */
606 return 1;
607 }
608 case GLSL_TYPE_ARRAY:
609 assert(type->length > 0);
610 return type_size(type->fields.array) * type->length;
611 case GLSL_TYPE_STRUCT:
612 size = 0;
613 for (i = 0; i < type->length; i++) {
614 size += type_size(type->fields.structure[i].type);
615 }
616 return size;
617 case GLSL_TYPE_SUBROUTINE:
618 return 1;
619
620 case GLSL_TYPE_SAMPLER:
621 /* Samplers take up no register space, since they're baked in at
622 * link time.
623 */
624 return 0;
625 case GLSL_TYPE_ATOMIC_UINT:
626 return 0;
627 case GLSL_TYPE_IMAGE:
628 case GLSL_TYPE_VOID:
629 case GLSL_TYPE_DOUBLE:
630 case GLSL_TYPE_ERROR:
631 case GLSL_TYPE_INTERFACE:
632 unreachable("not reached");
633 }
634
635 return 0;
636 }
637
638 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
639 {
640 init();
641
642 this->file = GRF;
643 this->reg = v->alloc.allocate(v->type_size(type));
644
645 if (type->is_array() || type->is_record()) {
646 this->swizzle = BRW_SWIZZLE_NOOP;
647 } else {
648 this->swizzle = brw_swizzle_for_size(type->vector_elements);
649 }
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
655 {
656 assert(size > 0);
657
658 init();
659
660 this->file = GRF;
661 this->reg = v->alloc.allocate(v->type_size(type) * size);
662
663 this->swizzle = BRW_SWIZZLE_NOOP;
664
665 this->type = brw_type_for_base_type(type);
666 }
667
668 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
669 {
670 init();
671
672 this->file = GRF;
673 this->reg = v->alloc.allocate(v->type_size(type));
674
675 if (type->is_array() || type->is_record()) {
676 this->writemask = WRITEMASK_XYZW;
677 } else {
678 this->writemask = (1 << type->vector_elements) - 1;
679 }
680
681 this->type = brw_type_for_base_type(type);
682 }
683
684 void
685 vec4_visitor::setup_vector_uniform_values(const gl_constant_value *values,
686 unsigned n)
687 {
688 static const gl_constant_value zero = { 0 };
689
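   /* Each uniform takes a full vec4 slot in param[]: the first n components
    * point at the supplied values, and any remaining components point at a
    * shared zero constant.
    */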
690 for (unsigned i = 0; i < n; ++i)
691 stage_prog_data->param[4 * uniforms + i] = &values[i];
692
693 for (unsigned i = n; i < 4; ++i)
694 stage_prog_data->param[4 * uniforms + i] = &zero;
695
696 uniform_vector_size[uniforms++] = n;
697 }
698
699 /* Our support for uniforms is piggy-backed on the struct
700 * gl_fragment_program, because that's where the values actually
701 * get stored, rather than in some global gl_shader_program uniform
702 * store.
703 */
704 void
705 vec4_visitor::setup_uniform_values(ir_variable *ir)
706 {
707 int namelen = strlen(ir->name);
708
709 /* The data for our (non-builtin) uniforms is stored in a series of
710 * gl_uniform_driver_storage structs for each subcomponent that
711 * glGetUniformLocation() could name. We know it's been set up in the same
712 * order we'd walk the type, so walk the list of storage and find anything
713 * with our name, or the prefix of a component that starts with our name.
714 */
715 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
716 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
717
718 if (storage->builtin)
719 continue;
720
721 if (strncmp(ir->name, storage->name, namelen) != 0 ||
722 (storage->name[namelen] != 0 &&
723 storage->name[namelen] != '.' &&
724 storage->name[namelen] != '[')) {
725 continue;
726 }
727
728 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
729 storage->type->matrix_columns);
730 const unsigned vector_size = storage->type->vector_elements;
731
732 for (unsigned s = 0; s < vector_count; s++)
733 setup_vector_uniform_values(&storage->storage[s * vector_size],
734 vector_size);
735 }
736 }
737
738 void
739 vec4_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
740 {
741 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
742 assert(this->uniforms < uniform_array_size);
743 this->uniform_vector_size[this->uniforms] = 4;
744 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
745 this->userplane[i].type = BRW_REGISTER_TYPE_F;
746 for (int j = 0; j < 4; ++j) {
747 stage_prog_data->param[this->uniforms * 4 + j] =
748 (gl_constant_value *) &clip_planes[i][j];
749 }
750 ++this->uniforms;
751 }
752 }
753
754 /* Our support for builtin uniforms is even scarier than non-builtin.
755 * It sits on top of the PROG_STATE_VAR parameters that are
756 * automatically updated from GL context state.
757 */
758 void
759 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
760 {
761 const ir_state_slot *const slots = ir->get_state_slots();
762 assert(slots != NULL);
763
764 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
765 /* This state reference has already been setup by ir_to_mesa,
766 * but we'll get the same index back here. We can reference
767 * ParameterValues directly, since unlike brw_fs.cpp, we never
768 * add new state references during compile.
769 */
770 int index = _mesa_add_state_reference(this->prog->Parameters,
771 (gl_state_index *)slots[i].tokens);
772 gl_constant_value *values =
773 &this->prog->Parameters->ParameterValues[index][0];
774
775 assert(this->uniforms < uniform_array_size);
776
777 for (unsigned j = 0; j < 4; j++)
778 stage_prog_data->param[this->uniforms * 4 + j] =
779 &values[GET_SWZ(slots[i].swizzle, j)];
780
781 this->uniform_vector_size[this->uniforms] =
782 (ir->type->is_scalar() || ir->type->is_vector() ||
783 ir->type->is_matrix() ? ir->type->vector_elements : 4);
784
785 this->uniforms++;
786 }
787 }
788
789 dst_reg *
790 vec4_visitor::variable_storage(ir_variable *var)
791 {
792 return (dst_reg *)hash_table_find(this->variable_ht, var);
793 }
794
795 void
796 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
797 enum brw_predicate *predicate)
798 {
799 ir_expression *expr = ir->as_expression();
800
801 *predicate = BRW_PREDICATE_NORMAL;
802
803 if (expr && expr->operation != ir_binop_ubo_load) {
804 src_reg op[3];
805 vec4_instruction *inst;
806
807 assert(expr->get_num_operands() <= 3);
808 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
809 expr->operands[i]->accept(this);
810 op[i] = this->result;
811
812 resolve_ud_negate(&op[i]);
813 }
814
815 switch (expr->operation) {
816 case ir_unop_logic_not:
817 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
818 inst->conditional_mod = BRW_CONDITIONAL_Z;
819 break;
820
821 case ir_binop_logic_xor:
822 if (devinfo->gen <= 5) {
823 src_reg temp = src_reg(this, ir->type);
824 emit(XOR(dst_reg(temp), op[0], op[1]));
825 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
826 } else {
827 inst = emit(XOR(dst_null_d(), op[0], op[1]));
828 }
829 inst->conditional_mod = BRW_CONDITIONAL_NZ;
830 break;
831
832 case ir_binop_logic_or:
833 if (devinfo->gen <= 5) {
834 src_reg temp = src_reg(this, ir->type);
835 emit(OR(dst_reg(temp), op[0], op[1]));
836 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
837 } else {
838 inst = emit(OR(dst_null_d(), op[0], op[1]));
839 }
840 inst->conditional_mod = BRW_CONDITIONAL_NZ;
841 break;
842
843 case ir_binop_logic_and:
844 if (devinfo->gen <= 5) {
845 src_reg temp = src_reg(this, ir->type);
846 emit(AND(dst_reg(temp), op[0], op[1]));
847 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
848 } else {
849 inst = emit(AND(dst_null_d(), op[0], op[1]));
850 }
851 inst->conditional_mod = BRW_CONDITIONAL_NZ;
852 break;
853
854 case ir_unop_f2b:
855 if (devinfo->gen >= 6) {
856 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
857 } else {
858 inst = emit(MOV(dst_null_f(), op[0]));
859 inst->conditional_mod = BRW_CONDITIONAL_NZ;
860 }
861 break;
862
863 case ir_unop_i2b:
864 if (devinfo->gen >= 6) {
865 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
866 } else {
867 inst = emit(MOV(dst_null_d(), op[0]));
868 inst->conditional_mod = BRW_CONDITIONAL_NZ;
869 }
870 break;
871
872 case ir_binop_all_equal:
873 if (devinfo->gen <= 5) {
874 resolve_bool_comparison(expr->operands[0], &op[0]);
875 resolve_bool_comparison(expr->operands[1], &op[1]);
876 }
877 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
878 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
879 break;
880
881 case ir_binop_any_nequal:
882 if (devinfo->gen <= 5) {
883 resolve_bool_comparison(expr->operands[0], &op[0]);
884 resolve_bool_comparison(expr->operands[1], &op[1]);
885 }
886 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
887 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
888 break;
889
890 case ir_unop_any:
891 if (devinfo->gen <= 5) {
892 resolve_bool_comparison(expr->operands[0], &op[0]);
893 }
894 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
895 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
896 break;
897
898 case ir_binop_greater:
899 case ir_binop_gequal:
900 case ir_binop_less:
901 case ir_binop_lequal:
902 case ir_binop_equal:
903 case ir_binop_nequal:
904 if (devinfo->gen <= 5) {
905 resolve_bool_comparison(expr->operands[0], &op[0]);
906 resolve_bool_comparison(expr->operands[1], &op[1]);
907 }
908 emit(CMP(dst_null_d(), op[0], op[1],
909 brw_conditional_for_comparison(expr->operation)));
910 break;
911
912 case ir_triop_csel: {
913 /* Expand the boolean condition into the flag register. */
914 inst = emit(MOV(dst_null_d(), op[0]));
915 inst->conditional_mod = BRW_CONDITIONAL_NZ;
916
917 /* Select which boolean to return. */
918 dst_reg temp(this, expr->operands[1]->type);
919 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
920 inst->predicate = BRW_PREDICATE_NORMAL;
921
922 /* Expand the result to a condition code. */
923 inst = emit(MOV(dst_null_d(), src_reg(temp)));
924 inst->conditional_mod = BRW_CONDITIONAL_NZ;
925 break;
926 }
927
928 default:
929 unreachable("not reached");
930 }
931 return;
932 }
933
934 ir->accept(this);
935
936 resolve_ud_negate(&this->result);
937
938 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
939 inst->conditional_mod = BRW_CONDITIONAL_NZ;
940 }
941
942 /**
943 * Emit a gen6 IF statement with the comparison folded into the IF
944 * instruction.
945 */
946 void
947 vec4_visitor::emit_if_gen6(ir_if *ir)
948 {
949 ir_expression *expr = ir->condition->as_expression();
950
951 if (expr && expr->operation != ir_binop_ubo_load) {
952 src_reg op[3];
953 dst_reg temp;
954
955 assert(expr->get_num_operands() <= 3);
956 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
957 expr->operands[i]->accept(this);
958 op[i] = this->result;
959 }
960
961 switch (expr->operation) {
962 case ir_unop_logic_not:
963 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
964 return;
965
966 case ir_binop_logic_xor:
967 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
968 return;
969
970 case ir_binop_logic_or:
971 temp = dst_reg(this, glsl_type::bool_type);
972 emit(OR(temp, op[0], op[1]));
973 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
974 return;
975
976 case ir_binop_logic_and:
977 temp = dst_reg(this, glsl_type::bool_type);
978 emit(AND(temp, op[0], op[1]));
979 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
980 return;
981
982 case ir_unop_f2b:
983 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
984 return;
985
986 case ir_unop_i2b:
987 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
988 return;
989
990 case ir_binop_greater:
991 case ir_binop_gequal:
992 case ir_binop_less:
993 case ir_binop_lequal:
994 case ir_binop_equal:
995 case ir_binop_nequal:
996 emit(IF(op[0], op[1],
997 brw_conditional_for_comparison(expr->operation)));
998 return;
999
1000 case ir_binop_all_equal:
1001 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1002 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1003 return;
1004
1005 case ir_binop_any_nequal:
1006 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1007 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1008 return;
1009
1010 case ir_unop_any:
1011 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1012 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1013 return;
1014
1015 case ir_triop_csel: {
1016 /* Expand the boolean condition into the flag register. */
1017 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1018 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1019
1020 /* Select which boolean to return. */
1021 dst_reg temp(this, expr->operands[1]->type);
1022 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1023 inst->predicate = BRW_PREDICATE_NORMAL;
1024
1025 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1026 return;
1027 }
1028
1029 default:
1030 unreachable("not reached");
1031 }
1032 return;
1033 }
1034
1035 ir->condition->accept(this);
1036
1037 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1038 }
1039
1040 void
1041 vec4_visitor::visit(ir_variable *ir)
1042 {
1043 dst_reg *reg = NULL;
1044
1045 if (variable_storage(ir))
1046 return;
1047
1048 switch (ir->data.mode) {
1049 case ir_var_shader_in:
1050 assert(ir->data.location != -1);
1051 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1052 break;
1053
1054 case ir_var_shader_out:
1055 assert(ir->data.location != -1);
1056 reg = new(mem_ctx) dst_reg(this, ir->type);
1057
1058 for (int i = 0; i < type_size(ir->type); i++) {
1059 output_reg[ir->data.location + i] = *reg;
1060 output_reg[ir->data.location + i].reg_offset = i;
1061 output_reg_annotation[ir->data.location + i] = ir->name;
1062 }
1063 break;
1064
1065 case ir_var_auto:
1066 case ir_var_temporary:
1067 reg = new(mem_ctx) dst_reg(this, ir->type);
1068 break;
1069
1070 case ir_var_uniform:
1071 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1072
1073 /* Thanks to the lower_ubo_reference pass, we will see only
1074 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1075 * variables, so no need for them to be in variable_ht.
1076 *
1077 * Some uniforms, such as samplers and atomic counters, have no actual
1078 * storage, so we should ignore them.
1079 */
1080 if (ir->is_in_buffer_block() || type_size(ir->type) == 0)
1081 return;
1082
1083 /* Track how big the whole uniform variable is, in case we need to put a
1084 * copy of its data into pull constants for array access.
1085 */
1086 assert(this->uniforms < uniform_array_size);
1087 this->uniform_size[this->uniforms] = type_size(ir->type);
1088
1089 if (!strncmp(ir->name, "gl_", 3)) {
1090 setup_builtin_uniform_values(ir);
1091 } else {
1092 setup_uniform_values(ir);
1093 }
1094 break;
1095
1096 case ir_var_system_value:
1097 reg = make_reg_for_system_value(ir->data.location, ir->type);
1098 break;
1099
1100 default:
1101 unreachable("not reached");
1102 }
1103
1104 reg->type = brw_type_for_base_type(ir->type);
1105 hash_table_insert(this->variable_ht, reg, ir);
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_loop *ir)
1110 {
1111 /* We don't want debugging output to print the whole body of the
1112 * loop as the annotation.
1113 */
1114 this->base_ir = NULL;
1115
1116 emit(BRW_OPCODE_DO);
1117
1118 visit_instructions(&ir->body_instructions);
1119
1120 emit(BRW_OPCODE_WHILE);
1121 }
1122
1123 void
1124 vec4_visitor::visit(ir_loop_jump *ir)
1125 {
1126 switch (ir->mode) {
1127 case ir_loop_jump::jump_break:
1128 emit(BRW_OPCODE_BREAK);
1129 break;
1130 case ir_loop_jump::jump_continue:
1131 emit(BRW_OPCODE_CONTINUE);
1132 break;
1133 }
1134 }
1135
1136
1137 void
1138 vec4_visitor::visit(ir_function_signature *)
1139 {
1140 unreachable("not reached");
1141 }
1142
1143 void
1144 vec4_visitor::visit(ir_function *ir)
1145 {
1146 /* Ignore function bodies other than main() -- we shouldn't see calls to
1147 * them since they should all be inlined.
1148 */
1149 if (strcmp(ir->name, "main") == 0) {
1150 const ir_function_signature *sig;
1151 exec_list empty;
1152
1153 sig = ir->matching_signature(NULL, &empty, false);
1154
1155 assert(sig);
1156
1157 visit_instructions(&sig->body);
1158 }
1159 }
1160
1161 bool
1162 vec4_visitor::try_emit_mad(ir_expression *ir)
1163 {
1164 /* 3-src instructions were introduced in gen6. */
1165 if (devinfo->gen < 6)
1166 return false;
1167
1168 /* MAD can only handle floating-point data. */
1169 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1170 return false;
1171
1172 ir_rvalue *nonmul;
1173 ir_expression *mul;
1174 bool mul_negate, mul_abs;
1175
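   /* Look on both sides of the ADD for a MUL (possibly wrapped in a negate
    * or abs) that can be folded into a MAD.
    */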
1176 for (int i = 0; i < 2; i++) {
1177 mul_negate = false;
1178 mul_abs = false;
1179
1180 mul = ir->operands[i]->as_expression();
1181 nonmul = ir->operands[1 - i];
1182
1183 if (mul && mul->operation == ir_unop_abs) {
1184 mul = mul->operands[0]->as_expression();
1185 mul_abs = true;
1186 } else if (mul && mul->operation == ir_unop_neg) {
1187 mul = mul->operands[0]->as_expression();
1188 mul_negate = true;
1189 }
1190
1191 if (mul && mul->operation == ir_binop_mul)
1192 break;
1193 }
1194
1195 if (!mul || mul->operation != ir_binop_mul)
1196 return false;
1197
1198 nonmul->accept(this);
1199 src_reg src0 = fix_3src_operand(this->result);
1200
1201 mul->operands[0]->accept(this);
1202 src_reg src1 = fix_3src_operand(this->result);
1203 src1.negate ^= mul_negate;
1204 src1.abs = mul_abs;
1205 if (mul_abs)
1206 src1.negate = false;
1207
1208 mul->operands[1]->accept(this);
1209 src_reg src2 = fix_3src_operand(this->result);
1210 src2.abs = mul_abs;
1211 if (mul_abs)
1212 src2.negate = false;
1213
1214 this->result = src_reg(this, ir->type);
1215 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1216
1217 return true;
1218 }
1219
1220 bool
1221 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1222 {
1223 /* This optimization relies on CMP setting the destination to 0 when
1224 * false. Early hardware only sets the least significant bit, and
1225 * leaves the other bits undefined, so we can't use this trick there.
1226 */
1227 if (devinfo->gen < 6)
1228 return false;
1229
1230 ir_expression *const cmp = ir->operands[0]->as_expression();
1231
1232 if (cmp == NULL)
1233 return false;
1234
1235 switch (cmp->operation) {
1236 case ir_binop_less:
1237 case ir_binop_greater:
1238 case ir_binop_lequal:
1239 case ir_binop_gequal:
1240 case ir_binop_equal:
1241 case ir_binop_nequal:
1242 break;
1243
1244 default:
1245 return false;
1246 }
1247
1248 cmp->operands[0]->accept(this);
1249 const src_reg cmp_src0 = this->result;
1250
1251 cmp->operands[1]->accept(this);
1252 const src_reg cmp_src1 = this->result;
1253
1254 this->result = src_reg(this, ir->type);
1255
1256 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1257 brw_conditional_for_comparison(cmp->operation)));
1258
1259 /* If the comparison is false, this->result will just happen to be zero.
1260 */
1261 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1262 this->result, src_reg(1.0f));
1263 inst->predicate = BRW_PREDICATE_NORMAL;
1264 inst->predicate_inverse = true;
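   /* With the inverted predicate, the SEL writes 1.0f in the channels where
    * the CMP above evaluated true and keeps the 0.0f result elsewhere.
    */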
1265
1266 return true;
1267 }
1268
1269 vec4_instruction *
1270 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1271 src_reg src0, src_reg src1)
1272 {
1273 vec4_instruction *inst;
1274
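   /* Gen6+ can fold the comparison into the SEL through a conditional
    * modifier; older parts need an explicit CMP to set the flag, followed by
    * a predicated SEL.
    */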
1275 if (devinfo->gen >= 6) {
1276 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1277 inst->conditional_mod = conditionalmod;
1278 } else {
1279 emit(CMP(dst, src0, src1, conditionalmod));
1280
1281 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1282 inst->predicate = BRW_PREDICATE_NORMAL;
1283 }
1284
1285 return inst;
1286 }
1287
1288 vec4_instruction *
1289 vec4_visitor::emit_lrp(const dst_reg &dst,
1290 const src_reg &x, const src_reg &y, const src_reg &a)
1291 {
1292 if (devinfo->gen >= 6) {
1293 /* Note that the instruction's argument order is reversed from GLSL
1294 * and the IR.
1295 */
1296 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1297 fix_3src_operand(x)));
1298 } else {
1299 /* Earlier generations don't support three source operations, so we
1300 * need to emit x*(1-a) + y*a.
1301 */
1302 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1303 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1304 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1305 y_times_a.writemask = dst.writemask;
1306 one_minus_a.writemask = dst.writemask;
1307 x_times_one_minus_a.writemask = dst.writemask;
1308
1309 emit(MUL(y_times_a, y, a));
1310 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1311 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1312 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1313 }
1314 }
1315
1316 /**
1317 * Emits the instructions needed to perform a pull constant load. before_block
1318 * and before_inst can be NULL, in which case the instructions will be appended
1319 * to the end of the instruction list.
1320 */
1321 void
1322 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1323 src_reg surf_index,
1324 src_reg offset_reg,
1325 bblock_t *before_block,
1326 vec4_instruction *before_inst)
1327 {
1328 assert((before_inst == NULL && before_block == NULL) ||
1329 (before_inst && before_block));
1330
1331 vec4_instruction *pull;
1332
1333 if (devinfo->gen >= 9) {
1334 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1335 src_reg header(this, glsl_type::uvec4_type, 2);
1336
1337 pull = new(mem_ctx)
1338 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1339 dst_reg(header));
1340
1341 if (before_inst)
1342 emit_before(before_block, before_inst, pull);
1343 else
1344 emit(pull);
1345
1346 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1347 offset_reg.type);
1348 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1349
1350 if (before_inst)
1351 emit_before(before_block, before_inst, pull);
1352 else
1353 emit(pull);
1354
1355 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1356 dst,
1357 surf_index,
1358 header);
1359 pull->mlen = 2;
1360 pull->header_size = 1;
1361 } else if (devinfo->gen >= 7) {
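      /* Gen7/Gen8 can send the offset directly from a GRF, so no message
       * header is needed and mlen is 1.
       */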
1362 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1363
1364 grf_offset.type = offset_reg.type;
1365
1366 pull = MOV(grf_offset, offset_reg);
1367
1368 if (before_inst)
1369 emit_before(before_block, before_inst, pull);
1370 else
1371 emit(pull);
1372
1373 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1374 dst,
1375 surf_index,
1376 src_reg(grf_offset));
1377 pull->mlen = 1;
1378 } else {
1379 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1380 dst,
1381 surf_index,
1382 offset_reg);
1383 pull->base_mrf = 14;
1384 pull->mlen = 1;
1385 }
1386
1387 if (before_inst)
1388 emit_before(before_block, before_inst, pull);
1389 else
1390 emit(pull);
1391 }
1392
1393 src_reg
1394 vec4_visitor::emit_uniformize(const src_reg &src)
1395 {
1396 const src_reg chan_index(this, glsl_type::uint_type);
1397 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1398 src.type);
1399
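   /* Find the index of any live channel, then broadcast that channel's value
    * of src to every channel of dst, giving a value that is uniform across
    * the SIMD execution.
    */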
1400 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1401 ->force_writemask_all = true;
1402 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1403 ->force_writemask_all = true;
1404
1405 return src_reg(dst);
1406 }
1407
1408 void
1409 vec4_visitor::visit(ir_expression *ir)
1410 {
1411 unsigned int operand;
1412 src_reg op[ARRAY_SIZE(ir->operands)];
1413 vec4_instruction *inst;
1414
1415 if (ir->operation == ir_binop_add) {
1416 if (try_emit_mad(ir))
1417 return;
1418 }
1419
1420 if (ir->operation == ir_unop_b2f) {
1421 if (try_emit_b2f_of_compare(ir))
1422 return;
1423 }
1424
1425 /* Storage for our result. Ideally for an assignment we'd be using
1426 * the actual storage for the result here, instead.
1427 */
1428 dst_reg result_dst(this, ir->type);
1429 src_reg result_src(result_dst);
1430
1431 if (ir->operation == ir_triop_csel) {
1432 ir->operands[1]->accept(this);
1433 op[1] = this->result;
1434 ir->operands[2]->accept(this);
1435 op[2] = this->result;
1436
1437 enum brw_predicate predicate;
1438 emit_bool_to_cond_code(ir->operands[0], &predicate);
1439 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1440 inst->predicate = predicate;
1441 this->result = result_src;
1442 return;
1443 }
1444
1445 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1446 this->result.file = BAD_FILE;
1447 ir->operands[operand]->accept(this);
1448 if (this->result.file == BAD_FILE) {
1449 fprintf(stderr, "Failed to get tree for expression operand:\n");
1450 ir->operands[operand]->fprint(stderr);
1451 exit(1);
1452 }
1453 op[operand] = this->result;
1454
1455 /* Matrix expression operands should have been broken down to vector
1456 * operations already.
1457 */
1458 assert(!ir->operands[operand]->type->is_matrix());
1459 }
1460
1461 /* If nothing special happens, this is the result. */
1462 this->result = result_src;
1463
1464 switch (ir->operation) {
1465 case ir_unop_logic_not:
1466 emit(NOT(result_dst, op[0]));
1467 break;
1468 case ir_unop_neg:
1469 op[0].negate = !op[0].negate;
1470 emit(MOV(result_dst, op[0]));
1471 break;
1472 case ir_unop_abs:
1473 op[0].abs = true;
1474 op[0].negate = false;
1475 emit(MOV(result_dst, op[0]));
1476 break;
1477
1478 case ir_unop_sign:
1479 if (ir->type->is_float()) {
1480 /* AND(val, 0x80000000) gives the sign bit.
1481 *
1482 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1483 * zero.
1484 */
1485 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1486
1487 op[0].type = BRW_REGISTER_TYPE_UD;
1488 result_dst.type = BRW_REGISTER_TYPE_UD;
1489 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1490
1491 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1492 inst->predicate = BRW_PREDICATE_NORMAL;
1493
1494 this->result.type = BRW_REGISTER_TYPE_F;
1495 } else {
1496 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1497 * -> non-negative val generates 0x00000000.
1498 * Predicated OR sets 1 if val is positive.
1499 */
1500 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1501
1502 emit(ASR(result_dst, op[0], src_reg(31)));
1503
1504 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1505 inst->predicate = BRW_PREDICATE_NORMAL;
1506 }
1507 break;
1508
1509 case ir_unop_rcp:
1510 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1511 break;
1512
1513 case ir_unop_exp2:
1514 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1515 break;
1516 case ir_unop_log2:
1517 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1518 break;
1519 case ir_unop_exp:
1520 case ir_unop_log:
1521 unreachable("not reached: should be handled by ir_explog_to_explog2");
1522 case ir_unop_sin:
1523 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1524 break;
1525 case ir_unop_cos:
1526 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1527 break;
1528
1529 case ir_unop_dFdx:
1530 case ir_unop_dFdx_coarse:
1531 case ir_unop_dFdx_fine:
1532 case ir_unop_dFdy:
1533 case ir_unop_dFdy_coarse:
1534 case ir_unop_dFdy_fine:
1535 unreachable("derivatives not valid in vertex shader");
1536
1537 case ir_unop_bitfield_reverse:
1538 emit(BFREV(result_dst, op[0]));
1539 break;
1540 case ir_unop_bit_count:
1541 emit(CBIT(result_dst, op[0]));
1542 break;
1543 case ir_unop_find_msb: {
1544 src_reg temp = src_reg(this, glsl_type::uint_type);
1545
1546 inst = emit(FBH(dst_reg(temp), op[0]));
1547 inst->dst.writemask = WRITEMASK_XYZW;
1548
1549 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1550 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1551 * subtract the result from 31 to convert the MSB count into an LSB count.
1552 */
1553
1554 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1555 temp.swizzle = BRW_SWIZZLE_NOOP;
1556 emit(MOV(result_dst, temp));
1557
1558 src_reg src_tmp = src_reg(result_dst);
1559 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1560
1561 src_tmp.negate = true;
1562 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1563 inst->predicate = BRW_PREDICATE_NORMAL;
1564 break;
1565 }
1566 case ir_unop_find_lsb:
1567 emit(FBL(result_dst, op[0]));
1568 break;
1569 case ir_unop_saturate:
1570 inst = emit(MOV(result_dst, op[0]));
1571 inst->saturate = true;
1572 break;
1573
1574 case ir_unop_noise:
1575 unreachable("not reached: should be handled by lower_noise");
1576
1577 case ir_unop_subroutine_to_int:
1578 emit(MOV(result_dst, op[0]));
1579 break;
1580
1581 case ir_binop_add:
1582 emit(ADD(result_dst, op[0], op[1]));
1583 break;
1584 case ir_binop_sub:
1585 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1586
1587 case ir_binop_mul:
1588 if (devinfo->gen < 8 && ir->type->is_integer()) {
1589 /* For integer multiplication, the MUL uses the low 16 bits of one of
1590 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1591 * then adds in the contribution of the upper 16 bits of that
1592 * operand. If we can determine that one of the args fits in the low
1593 * 16 bits, though, we can just emit a single MUL.
1594 */
1595 if (ir->operands[0]->is_uint16_constant()) {
1596 if (devinfo->gen < 7)
1597 emit(MUL(result_dst, op[0], op[1]));
1598 else
1599 emit(MUL(result_dst, op[1], op[0]));
1600 } else if (ir->operands[1]->is_uint16_constant()) {
1601 if (devinfo->gen < 7)
1602 emit(MUL(result_dst, op[1], op[0]));
1603 else
1604 emit(MUL(result_dst, op[0], op[1]));
1605 } else {
1606 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1607
1608 emit(MUL(acc, op[0], op[1]));
1609 emit(MACH(dst_null_d(), op[0], op[1]));
1610 emit(MOV(result_dst, src_reg(acc)));
1611 }
1612 } else {
1613 emit(MUL(result_dst, op[0], op[1]));
1614 }
1615 break;
1616 case ir_binop_imul_high: {
1617 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1618
1619 emit(MUL(acc, op[0], op[1]));
1620 emit(MACH(result_dst, op[0], op[1]));
1621 break;
1622 }
1623 case ir_binop_div:
1624 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1625 assert(ir->type->is_integer());
1626 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1627 break;
1628
1629 case ir_binop_carry:
1630 unreachable("Should have been lowered by carry_to_arith().");
1631
1632 case ir_binop_borrow:
1633 unreachable("Should have been lowered by borrow_to_arith().");
1634
1635 case ir_binop_mod:
1636 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1637 assert(ir->type->is_integer());
1638 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1639 break;
1640
1641 case ir_binop_less:
1642 case ir_binop_greater:
1643 case ir_binop_lequal:
1644 case ir_binop_gequal:
1645 case ir_binop_equal:
1646 case ir_binop_nequal: {
1647 if (devinfo->gen <= 5) {
1648 resolve_bool_comparison(ir->operands[0], &op[0]);
1649 resolve_bool_comparison(ir->operands[1], &op[1]);
1650 }
1651 emit(CMP(result_dst, op[0], op[1],
1652 brw_conditional_for_comparison(ir->operation)));
1653 break;
1654 }
1655
1656 case ir_binop_all_equal:
1657 if (devinfo->gen <= 5) {
1658 resolve_bool_comparison(ir->operands[0], &op[0]);
1659 resolve_bool_comparison(ir->operands[1], &op[1]);
1660 }
1661
1662 /* "==" operator producing a scalar boolean. */
1663 if (ir->operands[0]->type->is_vector() ||
1664 ir->operands[1]->type->is_vector()) {
1665 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1666 emit(MOV(result_dst, src_reg(0)));
1667 inst = emit(MOV(result_dst, src_reg(~0)));
1668 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1669 } else {
1670 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1671 }
1672 break;
1673 case ir_binop_any_nequal:
1674 if (devinfo->gen <= 5) {
1675 resolve_bool_comparison(ir->operands[0], &op[0]);
1676 resolve_bool_comparison(ir->operands[1], &op[1]);
1677 }
1678
1679 /* "!=" operator producing a scalar boolean. */
1680 if (ir->operands[0]->type->is_vector() ||
1681 ir->operands[1]->type->is_vector()) {
1682 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1683
1684 emit(MOV(result_dst, src_reg(0)));
1685 inst = emit(MOV(result_dst, src_reg(~0)));
1686 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1687 } else {
1688 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1689 }
1690 break;
1691
1692 case ir_unop_any:
1693 if (devinfo->gen <= 5) {
1694 resolve_bool_comparison(ir->operands[0], &op[0]);
1695 }
1696 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1697 emit(MOV(result_dst, src_reg(0)));
1698
1699 inst = emit(MOV(result_dst, src_reg(~0)));
1700 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1701 break;
1702
1703 case ir_binop_logic_xor:
1704 emit(XOR(result_dst, op[0], op[1]));
1705 break;
1706
1707 case ir_binop_logic_or:
1708 emit(OR(result_dst, op[0], op[1]));
1709 break;
1710
1711 case ir_binop_logic_and:
1712 emit(AND(result_dst, op[0], op[1]));
1713 break;
1714
1715 case ir_binop_dot:
1716 assert(ir->operands[0]->type->is_vector());
1717 assert(ir->operands[0]->type == ir->operands[1]->type);
1718 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1719 break;
1720
1721 case ir_unop_sqrt:
1722 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1723 break;
1724 case ir_unop_rsq:
1725 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1726 break;
1727
1728 case ir_unop_bitcast_i2f:
1729 case ir_unop_bitcast_u2f:
1730 this->result = op[0];
1731 this->result.type = BRW_REGISTER_TYPE_F;
1732 break;
1733
1734 case ir_unop_bitcast_f2i:
1735 this->result = op[0];
1736 this->result.type = BRW_REGISTER_TYPE_D;
1737 break;
1738
1739 case ir_unop_bitcast_f2u:
1740 this->result = op[0];
1741 this->result.type = BRW_REGISTER_TYPE_UD;
1742 break;
1743
1744 case ir_unop_i2f:
1745 case ir_unop_i2u:
1746 case ir_unop_u2i:
1747 case ir_unop_u2f:
1748 case ir_unop_f2i:
1749 case ir_unop_f2u:
1750 emit(MOV(result_dst, op[0]));
1751 break;
1752 case ir_unop_b2i:
1753 case ir_unop_b2f:
1754 if (devinfo->gen <= 5) {
1755 resolve_bool_comparison(ir->operands[0], &op[0]);
1756 }
1757 emit(MOV(result_dst, negate(op[0])));
1758 break;
1759 case ir_unop_f2b:
1760 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1761 break;
1762 case ir_unop_i2b:
1763 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1764 break;
1765
1766 case ir_unop_trunc:
1767 emit(RNDZ(result_dst, op[0]));
1768 break;
1769 case ir_unop_ceil: {
1770 src_reg tmp = src_reg(this, ir->type);
1771 op[0].negate = !op[0].negate;
1772 emit(RNDD(dst_reg(tmp), op[0]));
1773 tmp.negate = true;
1774 emit(MOV(result_dst, tmp));
1775 }
1776 break;
1777 case ir_unop_floor:
1778 inst = emit(RNDD(result_dst, op[0]));
1779 break;
1780 case ir_unop_fract:
1781 inst = emit(FRC(result_dst, op[0]));
1782 break;
1783 case ir_unop_round_even:
1784 emit(RNDE(result_dst, op[0]));
1785 break;
1786
1787 case ir_binop_min:
1788 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1789 break;
1790 case ir_binop_max:
1791 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1792 break;
1793
1794 case ir_binop_pow:
1795 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1796 break;
1797
1798 case ir_unop_bit_not:
1799 inst = emit(NOT(result_dst, op[0]));
1800 break;
1801 case ir_binop_bit_and:
1802 inst = emit(AND(result_dst, op[0], op[1]));
1803 break;
1804 case ir_binop_bit_xor:
1805 inst = emit(XOR(result_dst, op[0], op[1]));
1806 break;
1807 case ir_binop_bit_or:
1808 inst = emit(OR(result_dst, op[0], op[1]));
1809 break;
1810
1811 case ir_binop_lshift:
1812 inst = emit(SHL(result_dst, op[0], op[1]));
1813 break;
1814
1815 case ir_binop_rshift:
1816 if (ir->type->base_type == GLSL_TYPE_INT)
1817 inst = emit(ASR(result_dst, op[0], op[1]));
1818 else
1819 inst = emit(SHR(result_dst, op[0], op[1]));
1820 break;
1821
1822 case ir_binop_bfm:
1823 emit(BFI1(result_dst, op[0], op[1]));
1824 break;
1825
1826 case ir_binop_ubo_load: {
1827 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1828 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1829 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1830 src_reg offset;
1831
1832 /* Now, load the vector from that offset. */
1833 assert(ir->type->is_vector() || ir->type->is_scalar());
1834
1835 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1836 packed_consts.type = result.type;
1837 src_reg surf_index;
1838
1839 if (const_uniform_block) {
1840 /* The block index is a constant, so just emit the binding table entry
1841 * as an immediate.
1842 */
1843 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1844 const_uniform_block->value.u[0]);
1845 } else {
1846 /* The block index is not a constant. Evaluate the index expression
1847 * per-channel and add the base UBO index; we have to select a value
1848 * from any live channel.
1849 */
1850 surf_index = src_reg(this, glsl_type::uint_type);
1851 emit(ADD(dst_reg(surf_index), op[0],
1852 src_reg(prog_data->base.binding_table.ubo_start)));
1853 surf_index = emit_uniformize(surf_index);
1854
1855 /* Assume this may touch any UBO. It would be nice to provide
1856 * a tighter bound, but the array information is already lowered away.
1857 */
1858 brw_mark_surface_used(&prog_data->base,
1859 prog_data->base.binding_table.ubo_start +
1860 shader_prog->NumUniformBlocks - 1);
1861 }
1862
1863 if (const_offset_ir) {
1864 if (devinfo->gen >= 8) {
1865 /* Store the offset in a GRF so we can send-from-GRF. */
1866 offset = src_reg(this, glsl_type::int_type);
1867 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1868 } else {
1869 /* Immediates are fine on older generations since they'll be moved
1870 * to a (potentially fake) MRF at the generator level.
1871 */
1872 offset = src_reg(const_offset / 16);
1873 }
1874 } else {
1875 offset = src_reg(this, glsl_type::uint_type);
1876 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1877 }
1878
1879 emit_pull_constant_load_reg(dst_reg(packed_consts),
1880 surf_index,
1881 offset,
1882 NULL, NULL /* before_block/inst */);
1883
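      /* Select the components of the loaded vec4 slot that const_offset
       * points at: const_offset % 16 / 4 is the starting component within the
       * 16-byte slot, and adding it to each swizzle field shifts the read
       * accordingly.
       */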
1884 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1885 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1886 const_offset % 16 / 4,
1887 const_offset % 16 / 4,
1888 const_offset % 16 / 4);
1889
1890 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1891 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1892 emit(CMP(result_dst, packed_consts, src_reg(0u),
1893 BRW_CONDITIONAL_NZ));
1894 } else {
1895 emit(MOV(result_dst, packed_consts));
1896 }
1897 break;
1898 }
1899
1900 case ir_binop_vector_extract:
1901 unreachable("should have been lowered by vec_index_to_cond_assign");
1902
1903 case ir_triop_fma:
1904 op[0] = fix_3src_operand(op[0]);
1905 op[1] = fix_3src_operand(op[1]);
1906 op[2] = fix_3src_operand(op[2]);
1907 /* Note that the instruction's argument order is reversed from GLSL
1908 * and the IR.
1909 */
1910 emit(MAD(result_dst, op[2], op[1], op[0]));
1911 break;
1912
1913 case ir_triop_lrp:
1914 emit_lrp(result_dst, op[0], op[1], op[2]);
1915 break;
1916
1917 case ir_triop_csel:
1918 unreachable("already handled above");
1919 break;
1920
1921 case ir_triop_bfi:
1922 op[0] = fix_3src_operand(op[0]);
1923 op[1] = fix_3src_operand(op[1]);
1924 op[2] = fix_3src_operand(op[2]);
1925 emit(BFI2(result_dst, op[0], op[1], op[2]));
1926 break;
1927
1928 case ir_triop_bitfield_extract:
1929 op[0] = fix_3src_operand(op[0]);
1930 op[1] = fix_3src_operand(op[1]);
1931 op[2] = fix_3src_operand(op[2]);
1932 /* Note that the instruction's argument order is reversed from GLSL
1933 * and the IR.
1934 */
1935 emit(BFE(result_dst, op[2], op[1], op[0]));
1936 break;
1937
1938 case ir_triop_vector_insert:
1939 unreachable("should have been lowered by lower_vector_insert");
1940
1941 case ir_quadop_bitfield_insert:
1942 unreachable("not reached: should be handled by "
1943 "bitfield_insert_to_bfm_bfi\n");
1944
1945 case ir_quadop_vector:
1946 unreachable("not reached: should be handled by lower_quadop_vector");
1947
1948 case ir_unop_pack_half_2x16:
1949 emit_pack_half_2x16(result_dst, op[0]);
1950 break;
1951 case ir_unop_unpack_half_2x16:
1952 emit_unpack_half_2x16(result_dst, op[0]);
1953 break;
1954 case ir_unop_unpack_unorm_4x8:
1955 emit_unpack_unorm_4x8(result_dst, op[0]);
1956 break;
1957 case ir_unop_unpack_snorm_4x8:
1958 emit_unpack_snorm_4x8(result_dst, op[0]);
1959 break;
1960 case ir_unop_pack_unorm_4x8:
1961 emit_pack_unorm_4x8(result_dst, op[0]);
1962 break;
1963 case ir_unop_pack_snorm_4x8:
1964 emit_pack_snorm_4x8(result_dst, op[0]);
1965 break;
1966 case ir_unop_pack_snorm_2x16:
1967 case ir_unop_pack_unorm_2x16:
1968 case ir_unop_unpack_snorm_2x16:
1969 case ir_unop_unpack_unorm_2x16:
1970 unreachable("not reached: should be handled by lower_packing_builtins");
1971 case ir_unop_unpack_half_2x16_split_x:
1972 case ir_unop_unpack_half_2x16_split_y:
1973 case ir_binop_pack_half_2x16_split:
1974 case ir_unop_interpolate_at_centroid:
1975 case ir_binop_interpolate_at_sample:
1976 case ir_binop_interpolate_at_offset:
1977 unreachable("not reached: should not occur in vertex shader");
1978 case ir_binop_ldexp:
1979 unreachable("not reached: should be handled by ldexp_to_arith()");
1980 case ir_unop_d2f:
1981 case ir_unop_f2d:
1982 case ir_unop_d2i:
1983 case ir_unop_i2d:
1984 case ir_unop_d2u:
1985 case ir_unop_u2d:
1986 case ir_unop_d2b:
1987 case ir_unop_pack_double_2x32:
1988 case ir_unop_unpack_double_2x32:
1989 case ir_unop_frexp_sig:
1990 case ir_unop_frexp_exp:
1991 unreachable("fp64 todo");
1992 }
1993 }
1994
1995
1996 void
1997 vec4_visitor::visit(ir_swizzle *ir)
1998 {
1999 /* Note that this is only swizzles in expressions, not those on the left
2000 * hand side of an assignment, which do write masking. See ir_assignment
2001 * for that.
2002 */
2003 const unsigned swz = brw_compose_swizzle(
2004 brw_swizzle_for_size(ir->type->vector_elements),
2005 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2006
2007 ir->val->accept(this);
2008 this->result = swizzle(this->result, swz);
2009 }
2010
2011 void
2012 vec4_visitor::visit(ir_dereference_variable *ir)
2013 {
2014 const struct glsl_type *type = ir->type;
2015 dst_reg *reg = variable_storage(ir->var);
2016
2017 if (!reg) {
2018 fail("Failed to find variable storage for %s\n", ir->var->name);
2019 this->result = src_reg(brw_null_reg());
2020 return;
2021 }
2022
2023 this->result = src_reg(*reg);
2024
2025 /* System values get their swizzle from the dst_reg writemask */
2026 if (ir->var->data.mode == ir_var_system_value)
2027 return;
2028
2029 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2030 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2031 }
2032
2033
2034 int
2035 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2036 {
2037 /* Under normal circumstances array elements are stored consecutively, so
2038 * the stride is equal to the size of the array element.
2039 */
2040 return type_size(ir->type);
2041 }
2042
2043
2044 void
2045 vec4_visitor::visit(ir_dereference_array *ir)
2046 {
2047 ir_constant *constant_index;
2048 src_reg src;
2049 int array_stride = compute_array_stride(ir);
2050
2051 constant_index = ir->array_index->constant_expression_value();
2052
2053 ir->array->accept(this);
2054 src = this->result;
2055
2056 if (constant_index) {
2057 src.reg_offset += constant_index->value.i[0] * array_stride;
2058 } else {
2059 /* Variable index array dereference. It eats the "vec4" of the
2060 * base of the array and an index that offsets the Mesa register
2061 * index.
2062 */
2063 ir->array_index->accept(this);
2064
2065 src_reg index_reg;
2066
2067 if (array_stride == 1) {
2068 index_reg = this->result;
2069 } else {
2070 index_reg = src_reg(this, glsl_type::int_type);
2071
2072 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2073 }
2074
2075 if (src.reladdr) {
2076 src_reg temp = src_reg(this, glsl_type::int_type);
2077
2078 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2079
2080 index_reg = temp;
2081 }
2082
2083 src.reladdr = ralloc(mem_ctx, src_reg);
2084 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2085 }
2086
2087 /* If the type is smaller than a vec4, replicate the last channel out. */
2088 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2089 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2090 else
2091 src.swizzle = BRW_SWIZZLE_NOOP;
2092 src.type = brw_type_for_base_type(ir->type);
2093
2094 this->result = src;
2095 }
2096
2097 void
2098 vec4_visitor::visit(ir_dereference_record *ir)
2099 {
2100 unsigned int i;
2101 const glsl_type *struct_type = ir->record->type;
2102 int offset = 0;
2103
2104 ir->record->accept(this);
2105
2106 for (i = 0; i < struct_type->length; i++) {
2107 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2108 break;
2109 offset += type_size(struct_type->fields.structure[i].type);
2110 }
2111
2112 /* If the type is smaller than a vec4, replicate the last channel out. */
2113 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2114 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2115 else
2116 this->result.swizzle = BRW_SWIZZLE_NOOP;
2117 this->result.type = brw_type_for_base_type(ir->type);
2118
2119 this->result.reg_offset += offset;
2120 }
2121
2122 /**
2123 * We want to be careful in assignment setup to hit the actual storage
2124 * instead of potentially using a temporary like we might with the
2125 * ir_dereference handler.
2126 */
2127 static dst_reg
2128 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2129 {
2130 /* The LHS must be a dereference. If the LHS is a variable indexed array
2131 * access of a vector, it must be separated into a series of conditional moves
2132 * before reaching this point (see ir_vec_index_to_cond_assign).
2133 */
2134 assert(ir->as_dereference());
2135 ir_dereference_array *deref_array = ir->as_dereference_array();
2136 if (deref_array) {
2137 assert(!deref_array->array->type->is_vector());
2138 }
2139
2140 /* Use the rvalue deref handler for the most part. We'll ignore
2141 * swizzles in it and write swizzles using writemask, though.
2142 */
2143 ir->accept(v);
2144 return dst_reg(v->result);
2145 }
2146
2147 void
2148 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2149 const struct glsl_type *type,
2150 enum brw_predicate predicate)
2151 {
2152 if (type->base_type == GLSL_TYPE_STRUCT) {
2153 for (unsigned int i = 0; i < type->length; i++) {
2154 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2155 }
2156 return;
2157 }
2158
2159 if (type->is_array()) {
2160 for (unsigned int i = 0; i < type->length; i++) {
2161 emit_block_move(dst, src, type->fields.array, predicate);
2162 }
2163 return;
2164 }
2165
2166 if (type->is_matrix()) {
2167 const struct glsl_type *vec_type;
2168
2169 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2170 type->vector_elements, 1);
2171
2172 for (int i = 0; i < type->matrix_columns; i++) {
2173 emit_block_move(dst, src, vec_type, predicate);
2174 }
2175 return;
2176 }
2177
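/* Base case: a scalar or vector. Emit one (possibly predicated) MOV and step
 * both registers past the element so the caller can continue with the next one.
 */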
2178 assert(type->is_scalar() || type->is_vector());
2179
2180 dst->type = brw_type_for_base_type(type);
2181 src->type = dst->type;
2182
2183 dst->writemask = (1 << type->vector_elements) - 1;
2184
2185 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2186
2187 vec4_instruction *inst = emit(MOV(*dst, *src));
2188 inst->predicate = predicate;
2189
2190 dst->reg_offset++;
2191 src->reg_offset++;
2192 }
2193
2194
2195 /* If the RHS processing resulted in an instruction generating a
2196 * temporary value, and it would be easy to rewrite the instruction to
2197 * generate its result right into the LHS instead, do so. This ends
2198 * up reliably removing instructions where it can be tricky to do so
2199 * later without real UD chain information.
2200 */
2201 bool
2202 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2203 dst_reg dst,
2204 src_reg src,
2205 vec4_instruction *pre_rhs_inst,
2206 vec4_instruction *last_rhs_inst)
2207 {
2208 /* This could be supported, but it would take more smarts. */
2209 if (ir->condition)
2210 return false;
2211
2212 if (pre_rhs_inst == last_rhs_inst)
2213 return false; /* No instructions generated to work with. */
2214
2215 /* Make sure the last instruction generated our source reg. */
2216 if (src.file != GRF ||
2217 src.file != last_rhs_inst->dst.file ||
2218 src.reg != last_rhs_inst->dst.reg ||
2219 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2220 src.reladdr ||
2221 src.abs ||
2222 src.negate ||
2223 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2224 return false;
2225
2226 /* Check that the last instruction fully initialized the channels
2227 * we want to use, in the order we want to use them. We could
2228 * potentially reswizzle the operands of many instructions so that
2229 * we could handle out of order channels, but don't yet.
2230 */
2231
2232 for (unsigned i = 0; i < 4; i++) {
2233 if (dst.writemask & (1 << i)) {
2234 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2235 return false;
2236
2237 if (BRW_GET_SWZ(src.swizzle, i) != i)
2238 return false;
2239 }
2240 }
2241
2242 /* Success! Rewrite the instruction. */
2243 last_rhs_inst->dst.file = dst.file;
2244 last_rhs_inst->dst.reg = dst.reg;
2245 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2246 last_rhs_inst->dst.reladdr = dst.reladdr;
2247 last_rhs_inst->dst.writemask &= dst.writemask;
2248
2249 return true;
2250 }
2251
2252 void
2253 vec4_visitor::visit(ir_assignment *ir)
2254 {
2255 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2256 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2257
2258 if (!ir->lhs->type->is_scalar() &&
2259 !ir->lhs->type->is_vector()) {
2260 ir->rhs->accept(this);
2261 src_reg src = this->result;
2262
2263 if (ir->condition) {
2264 emit_bool_to_cond_code(ir->condition, &predicate);
2265 }
2266
2267 /* emit_block_move doesn't account for swizzles in the source register.
2268 * This should be ok, since the source register is a structure or an
2269 * array, and those can't be swizzled. But double-check to be sure.
2270 */
2271 assert(src.swizzle ==
2272 (ir->rhs->type->is_matrix()
2273 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2274 : BRW_SWIZZLE_NOOP));
2275
2276 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2277 return;
2278 }
2279
2280 /* Now we're down to just a scalar/vector with writemasks. */
2281 int i;
2282
2283 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2284 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2285
2286 ir->rhs->accept(this);
2287
2288 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2289
2290 int swizzles[4];
2291 int src_chan = 0;
2292
2293 assert(ir->lhs->type->is_vector() ||
2294 ir->lhs->type->is_scalar());
2295 dst.writemask = ir->write_mask;
2296
2297 /* Swizzle a small RHS vector into the channels being written.
2298 *
2299 * GLSL IR treats write_mask as dictating how many channels are
2300 * present on the RHS, while in our instructions we need to make
2301 * those channels appear in the slots of the vec4 they're written to.
2302 */
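/* For example, a write mask of .xz with a vec2 RHS yields the swizzle
 * (x, x, y, x): dst.x takes src.x and dst.z takes src.y, while the unwritten
 * channels are don't-cares.
 */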
2303 for (int i = 0; i < 4; i++)
2304 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2305
2306 src_reg src = swizzle(this->result,
2307 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2308 swizzles[2], swizzles[3]));
2309
2310 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2311 return;
2312 }
2313
2314 if (ir->condition) {
2315 emit_bool_to_cond_code(ir->condition, &predicate);
2316 }
2317
2318 for (i = 0; i < type_size(ir->lhs->type); i++) {
2319 vec4_instruction *inst = emit(MOV(dst, src));
2320 inst->predicate = predicate;
2321
2322 dst.reg_offset++;
2323 src.reg_offset++;
2324 }
2325 }
2326
2327 void
2328 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2329 {
2330 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2331 foreach_in_list(ir_constant, field_value, &ir->components) {
2332 emit_constant_values(dst, field_value);
2333 }
2334 return;
2335 }
2336
2337 if (ir->type->is_array()) {
2338 for (unsigned int i = 0; i < ir->type->length; i++) {
2339 emit_constant_values(dst, ir->array_elements[i]);
2340 }
2341 return;
2342 }
2343
2344 if (ir->type->is_matrix()) {
2345 for (int i = 0; i < ir->type->matrix_columns; i++) {
2346 float *vec = &ir->value.f[i * ir->type->vector_elements];
2347
2348 for (int j = 0; j < ir->type->vector_elements; j++) {
2349 dst->writemask = 1 << j;
2350 dst->type = BRW_REGISTER_TYPE_F;
2351
2352 emit(MOV(*dst, src_reg(vec[j])));
2353 }
2354 dst->reg_offset++;
2355 }
2356 return;
2357 }
2358
2359 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2360
2361 for (int i = 0; i < ir->type->vector_elements; i++) {
2362 if (!(remaining_writemask & (1 << i)))
2363 continue;
2364
2365 dst->writemask = 1 << i;
2366 dst->type = brw_type_for_base_type(ir->type);
2367
2368 /* Find other components that match the one we're about to
2369 * write. Emits fewer instructions for things like vec4(0.5,
2370 * 1.5, 1.5, 1.5).
2371 */
2372 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2373 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2374 if (ir->value.b[i] == ir->value.b[j])
2375 dst->writemask |= (1 << j);
2376 } else {
2377 /* u, i, and f storage all line up, so no need for a
2378 * switch case for comparing each type.
2379 */
2380 if (ir->value.u[i] == ir->value.u[j])
2381 dst->writemask |= (1 << j);
2382 }
2383 }
2384
2385 switch (ir->type->base_type) {
2386 case GLSL_TYPE_FLOAT:
2387 emit(MOV(*dst, src_reg(ir->value.f[i])));
2388 break;
2389 case GLSL_TYPE_INT:
2390 emit(MOV(*dst, src_reg(ir->value.i[i])));
2391 break;
2392 case GLSL_TYPE_UINT:
2393 emit(MOV(*dst, src_reg(ir->value.u[i])));
2394 break;
2395 case GLSL_TYPE_BOOL:
2396 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2397 break;
2398 default:
2399 unreachable("Non-float/uint/int/bool constant");
2400 }
2401
2402 remaining_writemask &= ~dst->writemask;
2403 }
2404 dst->reg_offset++;
2405 }
2406
2407 void
2408 vec4_visitor::visit(ir_constant *ir)
2409 {
2410 dst_reg dst = dst_reg(this, ir->type);
2411 this->result = src_reg(dst);
2412
2413 emit_constant_values(&dst, ir);
2414 }
2415
2416 void
2417 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2418 {
2419 ir_dereference *deref = static_cast<ir_dereference *>(
2420 ir->actual_parameters.get_head());
2421 ir_variable *location = deref->variable_referenced();
2422 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2423 location->data.binding);
2424
2425 /* Calculate the surface offset */
2426 src_reg offset(this, glsl_type::uint_type);
2427 ir_dereference_array *deref_array = deref->as_dereference_array();
2428 if (deref_array) {
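/* An array of counters: scale the dynamic index by the size of one counter
 * (ATOMIC_COUNTER_SIZE) and add the variable's base offset within the buffer.
 */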
2429 deref_array->array_index->accept(this);
2430
2431 src_reg tmp(this, glsl_type::uint_type);
2432 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2433 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2434 } else {
2435 offset = location->data.atomic.offset;
2436 }
2437
2438 /* Emit the appropriate machine instruction */
2439 const char *callee = ir->callee->function_name();
2440 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2441
2442 if (!strcmp("__intrinsic_atomic_read", callee)) {
2443 emit_untyped_surface_read(surf_index, dst, offset);
2444
2445 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2446 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2447 src_reg(), src_reg());
2448
2449 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2450 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2451 src_reg(), src_reg());
2452 }
2453
2454 brw_mark_surface_used(stage_prog_data, surf_index);
2455 }
2456
2457 void
2458 vec4_visitor::visit(ir_call *ir)
2459 {
2460 const char *callee = ir->callee->function_name();
2461
2462 if (!strcmp("__intrinsic_atomic_read", callee) ||
2463 !strcmp("__intrinsic_atomic_increment", callee) ||
2464 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2465 visit_atomic_counter_intrinsic(ir);
2466 } else {
2467 unreachable("Unsupported intrinsic.");
2468 }
2469 }
2470
2471 src_reg
2472 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2473 {
2474 vec4_instruction *inst =
2475 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2476 dst_reg(this, glsl_type::uvec4_type));
2477 inst->base_mrf = 2;
2478 inst->src[1] = sampler;
2479
2480 int param_base;
2481
2482 if (devinfo->gen >= 9) {
2483 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2484 vec4_instruction *header_inst = new(mem_ctx)
2485 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2486 dst_reg(MRF, inst->base_mrf));
2487
2488 emit(header_inst);
2489
2490 inst->mlen = 2;
2491 inst->header_size = 1;
2492 param_base = inst->base_mrf + 1;
2493 } else {
2494 inst->mlen = 1;
2495 param_base = inst->base_mrf;
2496 }
2497
2498 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2499 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2500 int zero_mask = 0xf & ~coord_mask;
2501
2502 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2503 coordinate));
2504
2505 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2506 src_reg(0)));
2507
2508 emit(inst);
2509 return src_reg(inst->dst);
2510 }
2511
2512 bool
2513 vec4_visitor::is_high_sampler(src_reg sampler)
2514 {
2515 if (devinfo->gen < 8 && !devinfo->is_haswell)
2516 return false;
2517
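/* Only Haswell and later can address more than 16 samplers, and doing so
 * requires passing the sampler index in the message header; a non-immediate
 * index could end up anywhere, so treat it as potentially high as well.
 */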
2518 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2519 }
2520
2521 void
2522 vec4_visitor::visit(ir_texture *ir)
2523 {
2524 uint32_t sampler =
2525 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2526
2527 ir_rvalue *nonconst_sampler_index =
2528 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2529
2530 /* Handle non-constant sampler array indexing */
2531 src_reg sampler_reg;
2532 if (nonconst_sampler_index) {
2533 /* The highest sampler which may be used by this operation is
2534 * the last element of the array. Mark it here, because the generator
2535 * doesn't have enough information to determine the bound.
2536 */
2537 uint32_t array_size = ir->sampler->as_dereference_array()
2538 ->array->type->array_size();
2539
2540 uint32_t max_used = sampler + array_size - 1;
2541 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2542 max_used += prog_data->base.binding_table.gather_texture_start;
2543 } else {
2544 max_used += prog_data->base.binding_table.texture_start;
2545 }
2546
2547 brw_mark_surface_used(&prog_data->base, max_used);
2548
2549 /* Emit code to evaluate the actual indexing expression */
2550 nonconst_sampler_index->accept(this);
2551 src_reg temp(this, glsl_type::uint_type);
2552 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2553 sampler_reg = emit_uniformize(temp);
2554 } else {
2555 /* Single sampler, or constant array index; the indexing expression
2556 * is just an immediate.
2557 */
2558 sampler_reg = src_reg(sampler);
2559 }
2560
2561 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2562 * emitting anything other than setting up the constant result.
2563 */
2564 if (ir->op == ir_tg4) {
2565 ir_constant *chan = ir->lod_info.component->as_constant();
2566 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2567 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2568 dst_reg result(this, ir->type);
2569 this->result = src_reg(result);
2570 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2571 return;
2572 }
2573 }
2574
2575 /* Should be lowered by do_lower_texture_projection */
2576 assert(!ir->projector);
2577
2578 /* Should be lowered */
2579 assert(!ir->offset || !ir->offset->type->is_array());
2580
2581 /* Generate code to compute all the subexpression trees. This has to be
2582 * done before loading any values into MRFs for the sampler message since
2583 * generating these values may involve SEND messages that need the MRFs.
2584 */
2585 src_reg coordinate;
2586 if (ir->coordinate) {
2587 ir->coordinate->accept(this);
2588 coordinate = this->result;
2589 }
2590
2591 src_reg shadow_comparitor;
2592 if (ir->shadow_comparitor) {
2593 ir->shadow_comparitor->accept(this);
2594 shadow_comparitor = this->result;
2595 }
2596
2597 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2598 src_reg offset_value;
2599 if (has_nonconstant_offset) {
2600 ir->offset->accept(this);
2601 offset_value = src_reg(this->result);
2602 }
2603
2604 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2605 src_reg lod, dPdx, dPdy, sample_index, mcs;
2606 switch (ir->op) {
2607 case ir_tex:
2608 lod = src_reg(0.0f);
2609 lod_type = glsl_type::float_type;
2610 break;
2611 case ir_txf:
2612 case ir_txl:
2613 case ir_txs:
2614 ir->lod_info.lod->accept(this);
2615 lod = this->result;
2616 lod_type = ir->lod_info.lod->type;
2617 break;
2618 case ir_query_levels:
2619 lod = src_reg(0);
2620 lod_type = glsl_type::int_type;
2621 break;
2622 case ir_txf_ms:
2623 ir->lod_info.sample_index->accept(this);
2624 sample_index = this->result;
2625 sample_index_type = ir->lod_info.sample_index->type;
2626
2627 if (devinfo->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2628 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2629 else
2630 mcs = src_reg(0u);
2631 break;
2632 case ir_txd:
2633 ir->lod_info.grad.dPdx->accept(this);
2634 dPdx = this->result;
2635
2636 ir->lod_info.grad.dPdy->accept(this);
2637 dPdy = this->result;
2638
2639 lod_type = ir->lod_info.grad.dPdx->type;
2640 break;
2641 case ir_txb:
2642 case ir_lod:
2643 case ir_tg4:
2644 break;
2645 }
2646
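/* Map the IR texture op to a hardware message. Vertex shaders have no
 * implicit derivatives, so plain ir_tex is implemented as TXL with the zero
 * LOD set up above.
 */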
2647 enum opcode opcode;
2648 switch (ir->op) {
2649 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2650 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2651 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2652 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2653 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2654 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2655 case ir_tg4: opcode = has_nonconstant_offset
2656 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2657 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2658 case ir_txb:
2659 unreachable("TXB is not valid for vertex shaders.");
2660 case ir_lod:
2661 unreachable("LOD is not valid for vertex shaders.");
2662 default:
2663 unreachable("Unrecognized tex op");
2664 }
2665
2666 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2667 opcode, dst_reg(this, ir->type));
2668
2669 if (ir->offset != NULL && !has_nonconstant_offset) {
2670 inst->offset =
2671 brw_texture_offset(ir->offset->as_constant()->value.i,
2672 ir->offset->type->vector_elements);
2673 }
2674
2675 /* Stuff the channel select bits in the top of the texture offset */
2676 if (ir->op == ir_tg4)
2677 inst->offset |= gather_channel(ir, sampler) << 16;
2678
2679 /* The message header is necessary for:
2680 * - Gen4 (always)
2681 * - Gen9+ for selecting SIMD4x2
2682 * - Texel offsets
2683 * - Gather channel selection
2684 * - Sampler indices too large to fit in a 4-bit value.
2685 */
2686 inst->header_size =
2687 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2688 inst->offset != 0 || ir->op == ir_tg4 ||
2689 is_high_sampler(sampler_reg)) ? 1 : 0;
2690 inst->base_mrf = 2;
2691 inst->mlen = inst->header_size + 1; /* always at least one */
2692 inst->dst.writemask = WRITEMASK_XYZW;
2693 inst->shadow_compare = ir->shadow_comparitor != NULL;
2694
2695 inst->src[1] = sampler_reg;
2696
2697 /* MRF for the first parameter */
2698 int param_base = inst->base_mrf + inst->header_size;
2699
2700 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2701 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2702 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2703 } else {
2704 /* Load the coordinate */
2705 /* FINISHME: gl_clamp_mask and saturate */
2706 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2707 int zero_mask = 0xf & ~coord_mask;
2708
2709 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2710 coordinate));
2711
2712 if (zero_mask != 0) {
2713 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2714 src_reg(0)));
2715 }
2716 /* Load the shadow comparitor */
2717 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2718 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2719 WRITEMASK_X),
2720 shadow_comparitor));
2721 inst->mlen++;
2722 }
2723
2724 /* Load the LOD info */
2725 if (ir->op == ir_tex || ir->op == ir_txl) {
2726 int mrf, writemask;
2727 if (devinfo->gen >= 5) {
2728 mrf = param_base + 1;
2729 if (ir->shadow_comparitor) {
2730 writemask = WRITEMASK_Y;
2731 /* mlen already incremented */
2732 } else {
2733 writemask = WRITEMASK_X;
2734 inst->mlen++;
2735 }
2736 } else /* devinfo->gen == 4 */ {
2737 mrf = param_base;
2738 writemask = WRITEMASK_W;
2739 }
2740 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2741 } else if (ir->op == ir_txf) {
2742 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2743 } else if (ir->op == ir_txf_ms) {
2744 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2745 sample_index));
2746 if (devinfo->gen >= 7) {
2747 /* MCS data is in the first channel of `mcs`, but we need to get it into
2748 * the .y channel of the second vec4 of params, so replicate .x across
2749 * the whole vec4 and then mask off everything except .y
2750 */
2751 mcs.swizzle = BRW_SWIZZLE_XXXX;
2752 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2753 mcs));
2754 }
2755 inst->mlen++;
2756 } else if (ir->op == ir_txd) {
2757 const glsl_type *type = lod_type;
2758
2759 if (devinfo->gen >= 5) {
2760 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2761 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2762 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2763 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2764 inst->mlen++;
2765
2766 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2767 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2768 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2769 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2770 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2771 inst->mlen++;
2772
2773 if (ir->shadow_comparitor) {
2774 emit(MOV(dst_reg(MRF, param_base + 2,
2775 ir->shadow_comparitor->type, WRITEMASK_Z),
2776 shadow_comparitor));
2777 }
2778 }
2779 } else /* devinfo->gen == 4 */ {
2780 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2781 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2782 inst->mlen += 2;
2783 }
2784 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2785 if (ir->shadow_comparitor) {
2786 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2787 shadow_comparitor));
2788 }
2789
2790 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2791 offset_value));
2792 inst->mlen++;
2793 }
2794 }
2795
2796 emit(inst);
2797
2798 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2799 * faces * layers, but the spec requires just the number of layers.
2800 */
2801 if (ir->op == ir_txs) {
2802 glsl_type const *type = ir->sampler->type;
2803 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2804 type->sampler_array) {
2805 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2806 writemask(inst->dst, WRITEMASK_Z),
2807 src_reg(inst->dst), src_reg(6));
2808 }
2809 }
2810
2811 if (devinfo->gen == 6 && ir->op == ir_tg4) {
2812 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2813 }
2814
2815 swizzle_result(ir, src_reg(inst->dst), sampler);
2816 }
2817
2818 /**
2819 * Apply workarounds for Gen6 gather with UINT/SINT
2820 */
2821 void
2822 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2823 {
2824 if (!wa)
2825 return;
2826
2827 int width = (wa & WA_8BIT) ? 8 : 16;
2828 dst_reg dst_f = dst;
2829 dst_f.type = BRW_REGISTER_TYPE_F;
2830
2831 /* Convert from UNORM to UINT */
2832 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2833 emit(MOV(dst, src_reg(dst_f)));
2834
2835 if (wa & WA_SIGN) {
2836 /* Reinterpret the UINT value as a signed INT value by
2837 * shifting the sign bit into place, then shifting back
2838 * preserving sign.
2839 */
2840 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2841 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2842 }
2843 }
2844
2845 /**
2846 * Set up the gather channel based on the swizzle, for gather4.
2847 */
2848 uint32_t
2849 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2850 {
2851 ir_constant *chan = ir->lod_info.component->as_constant();
2852 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2853 switch (swiz) {
2854 case SWIZZLE_X: return 0;
2855 case SWIZZLE_Y:
2856 /* The gather4 sampler is broken for the green channel on RG32F --
2857 * we must ask for blue instead.
2858 */
2859 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2860 return 2;
2861 return 1;
2862 case SWIZZLE_Z: return 2;
2863 case SWIZZLE_W: return 3;
2864 default:
2865 unreachable("Not reached"); /* zero, one swizzles handled already */
2866 }
2867 }
2868
2869 void
2870 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2871 {
2872 int s = key->tex.swizzles[sampler];
2873
2874 this->result = src_reg(this, ir->type);
2875 dst_reg swizzled_result(this->result);
2876
2877 if (ir->op == ir_query_levels) {
2878 /* # levels is in .w */
2879 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2880 emit(MOV(swizzled_result, orig_val));
2881 return;
2882 }
2883
2884 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2885 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2886 emit(MOV(swizzled_result, orig_val));
2887 return;
2888 }
2889
2890
2891 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2892 int swizzle[4] = {0};
2893
2894 for (int i = 0; i < 4; i++) {
2895 switch (GET_SWZ(s, i)) {
2896 case SWIZZLE_ZERO:
2897 zero_mask |= (1 << i);
2898 break;
2899 case SWIZZLE_ONE:
2900 one_mask |= (1 << i);
2901 break;
2902 default:
2903 copy_mask |= (1 << i);
2904 swizzle[i] = GET_SWZ(s, i);
2905 break;
2906 }
2907 }
2908
2909 if (copy_mask) {
2910 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2911 swizzled_result.writemask = copy_mask;
2912 emit(MOV(swizzled_result, orig_val));
2913 }
2914
2915 if (zero_mask) {
2916 swizzled_result.writemask = zero_mask;
2917 emit(MOV(swizzled_result, src_reg(0.0f)));
2918 }
2919
2920 if (one_mask) {
2921 swizzled_result.writemask = one_mask;
2922 emit(MOV(swizzled_result, src_reg(1.0f)));
2923 }
2924 }
2925
2926 void
2927 vec4_visitor::visit(ir_return *)
2928 {
2929 unreachable("not reached");
2930 }
2931
2932 void
2933 vec4_visitor::visit(ir_discard *)
2934 {
2935 unreachable("not reached");
2936 }
2937
2938 void
2939 vec4_visitor::visit(ir_if *ir)
2940 {
2941 /* Don't point the annotation at the if statement, because then it plus
2942 * the then and else blocks get printed.
2943 */
2944 this->base_ir = ir->condition;
2945
2946 if (devinfo->gen == 6) {
2947 emit_if_gen6(ir);
2948 } else {
2949 enum brw_predicate predicate;
2950 emit_bool_to_cond_code(ir->condition, &predicate);
2951 emit(IF(predicate));
2952 }
2953
2954 visit_instructions(&ir->then_instructions);
2955
2956 if (!ir->else_instructions.is_empty()) {
2957 this->base_ir = ir->condition;
2958 emit(BRW_OPCODE_ELSE);
2959
2960 visit_instructions(&ir->else_instructions);
2961 }
2962
2963 this->base_ir = ir->condition;
2964 emit(BRW_OPCODE_ENDIF);
2965 }
2966
2967 void
2968 vec4_visitor::visit(ir_emit_vertex *)
2969 {
2970 unreachable("not reached");
2971 }
2972
2973 void
2974 vec4_visitor::visit(ir_end_primitive *)
2975 {
2976 unreachable("not reached");
2977 }
2978
2979 void
2980 vec4_visitor::visit(ir_barrier *)
2981 {
2982 unreachable("not reached");
2983 }
2984
2985 void
2986 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2987 dst_reg dst, src_reg offset,
2988 src_reg src0, src_reg src1)
2989 {
2990 unsigned mlen = 0;
2991
2992 /* Set the atomic operation offset. */
2993 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2994 mlen++;
2995
2996 /* Set the atomic operation arguments. */
2997 if (src0.file != BAD_FILE) {
2998 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2999 mlen++;
3000 }
3001
3002 if (src1.file != BAD_FILE) {
3003 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3004 mlen++;
3005 }
3006
3007 /* Emit the instruction. Note that this maps to the normal SIMD8
3008 * untyped atomic message on Ivy Bridge, but that's OK because
3009 * unused channels will be masked out.
3010 */
3011 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3012 brw_message_reg(0),
3013 src_reg(surf_index), src_reg(atomic_op));
3014 inst->mlen = mlen;
3015 }
3016
3017 void
3018 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3019 src_reg offset)
3020 {
3021 /* Set the surface read offset. */
3022 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3023
3024 /* Emit the instruction. Note that this maps to the normal SIMD8
3025 * untyped surface read message, but that's OK because unused
3026 * channels will be masked out.
3027 */
3028 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3029 brw_message_reg(0),
3030 src_reg(surf_index), src_reg(1));
3031 inst->mlen = 1;
3032 }
3033
3034 void
3035 vec4_visitor::emit_ndc_computation()
3036 {
3037 /* Get the position */
3038 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3039
3040 /* Build the NDC coordinates, which are (x/w, y/w, z/w, 1/w) */
3041 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3042 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3043
3044 current_annotation = "NDC";
3045 dst_reg ndc_w = ndc;
3046 ndc_w.writemask = WRITEMASK_W;
3047 src_reg pos_w = pos;
3048 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3049 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3050
3051 dst_reg ndc_xyz = ndc;
3052 ndc_xyz.writemask = WRITEMASK_XYZ;
3053
3054 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3055 }
3056
3057 void
3058 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3059 {
3060 if (devinfo->gen < 6 &&
3061 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3062 key->userclip_active || devinfo->has_negative_rhw_bug)) {
3063 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3064 dst_reg header1_w = header1;
3065 header1_w.writemask = WRITEMASK_W;
3066
3067 emit(MOV(header1, 0u));
3068
3069 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3070 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3071
3072 current_annotation = "Point size";
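/* Scale the point size by 2^11 and keep the 11-bit field starting at bit 8
 * of the header dword; the float product is converted to UD on the write to
 * header1_w.
 */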
3073 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3074 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3075 }
3076
3077 if (key->userclip_active) {
3078 current_annotation = "Clipping flags";
3079 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3080 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3081
3082 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3083 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3084 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3085
3086 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3087 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3088 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3089 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3090 }
3091
3092 /* i965 clipping workaround:
3093 * 1) Test for negative RHW
3094 * 2) If it is negative,
3095 * set ndc = (0,0,0,0)
3096 * set ucp[6] = 1
3097 *
3098 * Later, clipping will detect ucp[6] and ensure the primitive is
3099 * clipped against all fixed planes.
3100 */
3101 if (devinfo->has_negative_rhw_bug) {
3102 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3103 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3104 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3105 vec4_instruction *inst;
3106 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3107 inst->predicate = BRW_PREDICATE_NORMAL;
3108 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3109 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3110 inst->predicate = BRW_PREDICATE_NORMAL;
3111 }
3112
3113 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3114 } else if (devinfo->gen < 6) {
3115 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3116 } else {
3117 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3118 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3119 dst_reg reg_w = reg;
3120 reg_w.writemask = WRITEMASK_W;
3121 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3122 reg_as_src.type = reg_w.type;
3123 reg_as_src.swizzle = brw_swizzle_for_size(1);
3124 emit(MOV(reg_w, reg_as_src));
3125 }
3126 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3127 dst_reg reg_y = reg;
3128 reg_y.writemask = WRITEMASK_Y;
3129 reg_y.type = BRW_REGISTER_TYPE_D;
3130 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3131 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3132 }
3133 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3134 dst_reg reg_z = reg;
3135 reg_z.writemask = WRITEMASK_Z;
3136 reg_z.type = BRW_REGISTER_TYPE_D;
3137 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3138 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3139 }
3140 }
3141 }
3142
3143 void
3144 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3145 {
3146 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3147 *
3148 * "If a linked set of shaders forming the vertex stage contains no
3149 * static write to gl_ClipVertex or gl_ClipDistance, but the
3150 * application has requested clipping against user clip planes through
3151 * the API, then the coordinate written to gl_Position is used for
3152 * comparison against the user clip planes."
3153 *
3154 * This function is only called if the shader didn't write to
3155 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3156 * if the user wrote to it; otherwise we use gl_Position.
3157 */
3158 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3159 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3160 clip_vertex = VARYING_SLOT_POS;
3161 }
3162
3163 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3164 ++i) {
3165 reg.writemask = 1 << i;
3166 emit(DP4(reg,
3167 src_reg(output_reg[clip_vertex]),
3168 src_reg(this->userplane[i + offset])));
3169 }
3170 }
3171
3172 vec4_instruction *
3173 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3174 {
3175 assert(varying < VARYING_SLOT_MAX);
3176 assert(output_reg[varying].type == reg.type);
3177 current_annotation = output_reg_annotation[varying];
3178 /* Copy the register, saturating if necessary */
3179 return emit(MOV(reg, src_reg(output_reg[varying])));
3180 }
3181
3182 void
3183 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3184 {
3185 reg.type = BRW_REGISTER_TYPE_F;
3186 output_reg[varying].type = reg.type;
3187
3188 switch (varying) {
3189 case VARYING_SLOT_PSIZ:
3190 {
3191 /* PSIZ is always in slot 0, and is coupled with other flags. */
3192 current_annotation = "indices, point width, clip flags";
3193 emit_psiz_and_flags(reg);
3194 break;
3195 }
3196 case BRW_VARYING_SLOT_NDC:
3197 current_annotation = "NDC";
3198 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3199 break;
3200 case VARYING_SLOT_POS:
3201 current_annotation = "gl_Position";
3202 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3203 break;
3204 case VARYING_SLOT_EDGE:
3205 /* This is present when doing unfilled polygons. We're supposed to copy
3206 * the edge flag from the user-provided vertex array
3207 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3208 * of that attribute (starts as 1.0f). This is then used in clipping to
3209 * determine which edges should be drawn as wireframe.
3210 */
3211 current_annotation = "edge flag";
3212 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3213 glsl_type::float_type, WRITEMASK_XYZW))));
3214 break;
3215 case BRW_VARYING_SLOT_PAD:
3216 /* No need to write to this slot */
3217 break;
3218 case VARYING_SLOT_COL0:
3219 case VARYING_SLOT_COL1:
3220 case VARYING_SLOT_BFC0:
3221 case VARYING_SLOT_BFC1: {
3222 /* These built-in varyings are only supported in compatibility mode,
3223 * and we only support GS in core profile. So, this must be a vertex
3224 * shader.
3225 */
3226 assert(stage == MESA_SHADER_VERTEX);
3227 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3228 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3229 inst->saturate = true;
3230 break;
3231 }
3232
3233 default:
3234 emit_generic_urb_slot(reg, varying);
3235 break;
3236 }
3237 }
3238
3239 static int
3240 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3241 {
3242 if (devinfo->gen >= 6) {
3243 /* URB data written (does not include the message header reg) must
3244 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3245 * section 5.4.3.2.2: URB_INTERLEAVED.
3246 *
3247 * URB entries are allocated on a multiple of 1024 bits, so an
3248 * extra 128 bits written here to make the end align to 256 is
3249 * no problem.
3250 */
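/* mlen includes the message header register, so the total must be odd for
 * the data portion (mlen - 1) to remain a multiple of two registers.
 */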
3251 if ((mlen % 2) != 1)
3252 mlen++;
3253 }
3254
3255 return mlen;
3256 }
3257
3258
3259 /**
3260 * Generates the VUE payload plus the necessary URB write instructions to
3261 * output it.
3262 *
3263 * The VUE layout is documented in Volume 2a.
3264 */
3265 void
3266 vec4_visitor::emit_vertex()
3267 {
3268 /* MRF 0 is reserved for the debugger, so start with message header
3269 * in MRF 1.
3270 */
3271 int base_mrf = 1;
3272 int mrf = base_mrf;
3273 /* In the process of generating our URB write message contents, we
3274 * may need to unspill a register or load from an array. Those
3275 * reads would use MRFs 14-15.
3276 */
3277 int max_usable_mrf = 13;
3278
3279 /* The following assertion verifies that max_usable_mrf causes an
3280 * even number of URB write data registers, which meets gen6's
3281 * length-alignment requirements.
3282 */
3283 assert((max_usable_mrf - base_mrf) % 2 == 0);
3284
3285 /* First mrf is the g0-based message header containing URB handles and
3286 * such.
3287 */
3288 emit_urb_write_header(mrf++);
3289
3290 if (devinfo->gen < 6) {
3291 emit_ndc_computation();
3292 }
3293
3294 /* Lower legacy ff and ClipVertex clipping to clip distances */
3295 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3296 current_annotation = "user clip distances";
3297
3298 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3299 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3300
3301 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3302 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3303 }
3304
3305 /* We may need to split this up into several URB writes, so do them in a
3306 * loop.
3307 */
3308 int slot = 0;
3309 bool complete = false;
3310 do {
3311 /* URB offset is in URB row increments, and each of our MRFs is half of
3312 * one of those, since we're doing interleaved writes.
3313 */
3314 int offset = slot / 2;
3315
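/* base_mrf holds the message header emitted above; every URB write in this
 * loop reuses it, so payload slots start at base_mrf + 1.
 */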
3316 mrf = base_mrf + 1;
3317 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3318 emit_urb_slot(dst_reg(MRF, mrf++),
3319 prog_data->vue_map.slot_to_varying[slot]);
3320
3321 /* If this was max_usable_mrf, we can't fit anything more into this
3322 * URB WRITE.
3323 */
3324 if (mrf > max_usable_mrf) {
3325 slot++;
3326 break;
3327 }
3328 }
3329
3330 complete = slot >= prog_data->vue_map.num_slots;
3331 current_annotation = "URB write";
3332 vec4_instruction *inst = emit_urb_write_opcode(complete);
3333 inst->base_mrf = base_mrf;
3334 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3335 inst->offset += offset;
3336 } while (!complete);
3337 }
3338
3339
3340 src_reg
3341 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3342 src_reg *reladdr, int reg_offset)
3343 {
3344 /* Because we store the values to scratch interleaved like our
3345 * vertex data, we need to scale the vec4 index by 2.
3346 */
3347 int message_header_scale = 2;
3348
3349 /* Pre-gen6, the message header uses byte offsets instead of vec4
3350 * (16-byte) offset units.
3351 */
3352 if (devinfo->gen < 6)
3353 message_header_scale *= 16;
3354
3355 if (reladdr) {
3356 src_reg index = src_reg(this, glsl_type::int_type);
3357
3358 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3359 src_reg(reg_offset)));
3360 emit_before(block, inst, MUL(dst_reg(index), index,
3361 src_reg(message_header_scale)));
3362
3363 return index;
3364 } else {
3365 return src_reg(reg_offset * message_header_scale);
3366 }
3367 }
3368
3369 src_reg
3370 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3371 src_reg *reladdr, int reg_offset)
3372 {
3373 if (reladdr) {
3374 src_reg index = src_reg(this, glsl_type::int_type);
3375
3376 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3377 src_reg(reg_offset)));
3378
3379 /* Pre-gen6, the message header uses byte offsets instead of vec4
3380 * (16-byte) offset units.
3381 */
3382 if (devinfo->gen < 6) {
3383 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3384 }
3385
3386 return index;
3387 } else if (devinfo->gen >= 8) {
3388 /* Store the offset in a GRF so we can send-from-GRF. */
3389 src_reg offset = src_reg(this, glsl_type::int_type);
3390 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3391 return offset;
3392 } else {
3393 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3394 return src_reg(reg_offset * message_header_scale);
3395 }
3396 }
3397
3398 /**
3399 * Emits an instruction before @inst to load the value named by @orig_src
3400 * from scratch space at @base_offset to @temp.
3401 *
3402 * @base_offset is measured in 32-byte units (the size of a register).
3403 */
3404 void
3405 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3406 dst_reg temp, src_reg orig_src,
3407 int base_offset)
3408 {
3409 int reg_offset = base_offset + orig_src.reg_offset;
3410 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3411 reg_offset);
3412
3413 emit_before(block, inst, SCRATCH_READ(temp, index));
3414 }
3415
3416 /**
3417 * Emits an instruction after @inst to store the value to be written
3418 * to @orig_dst to scratch space at @base_offset, from @temp.
3419 *
3420 * @base_offset is measured in 32-byte units (the size of a register).
3421 */
3422 void
3423 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3424 int base_offset)
3425 {
3426 int reg_offset = base_offset + inst->dst.reg_offset;
3427 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3428 reg_offset);
3429
3430 /* Create a temporary register to store *inst's result in.
3431 *
3432 * We have to be careful in MOVing from our temporary result register in
3433 * the scratch write. If we swizzle from channels of the temporary that
3434 * weren't initialized, it will confuse live interval analysis, which will
3435 * make spilling fail to make progress.
3436 */
3437 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3438 inst->dst.type),
3439 brw_swizzle_for_mask(inst->dst.writemask));
3440 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3441 inst->dst.writemask));
3442 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3443 write->predicate = inst->predicate;
3444 write->ir = inst->ir;
3445 write->annotation = inst->annotation;
3446 inst->insert_after(block, write);
3447
3448 inst->dst.file = temp.file;
3449 inst->dst.reg = temp.reg;
3450 inst->dst.reg_offset = temp.reg_offset;
3451 inst->dst.reladdr = NULL;
3452 }
3453
3454 /**
3455 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3456 * adds the scratch read(s) before \p inst. The function also checks for
3457 * recursive reladdr scratch accesses, issuing the corresponding scratch
3458 * loads and rewriting reladdr references accordingly.
3459 *
3460 * \return \p src if it did not require a scratch load, otherwise, the
3461 * register holding the result of the scratch load that the caller should
3462 * use to rewrite src.
3463 */
3464 src_reg
3465 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3466 vec4_instruction *inst, src_reg src)
3467 {
3468 /* Resolve recursive reladdr scratch access by calling ourselves
3469 * with src.reladdr
3470 */
3471 if (src.reladdr)
3472 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3473 *src.reladdr);
3474
3475 /* Now handle scratch access on src */
3476 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3477 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3478 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3479 src.reg = temp.reg;
3480 src.reg_offset = temp.reg_offset;
3481 src.reladdr = NULL;
3482 }
3483
3484 return src;
3485 }
3486
3487 /**
3488 * We can't generally support array access in GRF space, because a
3489 * single instruction's destination can only span 2 contiguous
3490 * registers. So, we send all GRF arrays that get variable index
3491 * access to scratch space.
3492 */
3493 void
3494 vec4_visitor::move_grf_array_access_to_scratch()
3495 {
3496 int scratch_loc[this->alloc.count];
3497 memset(scratch_loc, -1, sizeof(scratch_loc));
3498
3499 /* First, calculate the set of virtual GRFs that need to be punted
3500 * to scratch due to having any array access on them, and where in
3501 * scratch.
3502 */
3503 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3504 if (inst->dst.file == GRF && inst->dst.reladdr) {
3505 if (scratch_loc[inst->dst.reg] == -1) {
3506 scratch_loc[inst->dst.reg] = last_scratch;
3507 last_scratch += this->alloc.sizes[inst->dst.reg];
3508 }
3509
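/* The destination's reladdr may itself live in a register that is indirectly
 * addressed; walk the reladdr chain and flag those registers for scratch too.
 */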
3510 for (src_reg *iter = inst->dst.reladdr;
3511 iter->reladdr;
3512 iter = iter->reladdr) {
3513 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3514 scratch_loc[iter->reg] = last_scratch;
3515 last_scratch += this->alloc.sizes[iter->reg];
3516 }
3517 }
3518 }
3519
3520 for (int i = 0; i < 3; i++) {
3521 for (src_reg *iter = &inst->src[i];
3522 iter->reladdr;
3523 iter = iter->reladdr) {
3524 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3525 scratch_loc[iter->reg] = last_scratch;
3526 last_scratch += this->alloc.sizes[iter->reg];
3527 }
3528 }
3529 }
3530 }
3531
3532 /* Now, for anything that will be accessed through scratch, rewrite
3533 * it to load/store. Note that this is a _safe list walk, because
3534 * we may generate a new scratch_write instruction after the one
3535 * we're processing.
3536 */
3537 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3538 /* Set up the annotation tracking for new generated instructions. */
3539 base_ir = inst->ir;
3540 current_annotation = inst->annotation;
3541
3542 /* First handle scratch access on the dst. Notice we have to handle
3543 * the case where the dst's reladdr also points to scratch space.
3544 */
3545 if (inst->dst.reladdr)
3546 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3547 *inst->dst.reladdr);
3548
3549 /* Now that we have handled any (possibly recursive) reladdr scratch
3550 * accesses for dst we can safely do the scratch write for dst itself
3551 */
3552 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3553 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3554
3555 /* Now handle scratch access on any src. In this case, since inst->src[i]
3556 * already is a src_reg, we can just call emit_resolve_reladdr with
3557 * inst->src[i] and it will take care of handling scratch loads for
3558 * both src and src.reladdr (recursively).
3559 */
3560 for (int i = 0; i < 3; i++) {
3561 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3562 inst->src[i]);
3563 }
3564 }
3565 }
3566
3567 /**
3568 * Emits an instruction before @inst to load the value named by @orig_src
3569 * from the pull constant buffer (surface) at @base_offset to @temp.
3570 */
3571 void
3572 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3573 dst_reg temp, src_reg orig_src,
3574 int base_offset)
3575 {
3576 int reg_offset = base_offset + orig_src.reg_offset;
3577 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3578 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3579 reg_offset);
3580
3581 emit_pull_constant_load_reg(temp,
3582 index,
3583 offset,
3584 block, inst);
3585 }
3586
3587 /**
3588 * Implements array access of uniforms by inserting a
3589 * PULL_CONSTANT_LOAD instruction.
3590 *
3591 * Unlike temporary GRF array access (which we don't support, due to
3592 * the difficulty of doing relative addressing on instruction
3593 * destinations), we could potentially do array access of uniforms
3594 * that were loaded in GRF space as push constants. In real-world
3595 * usage we've seen, though, the arrays being used are always larger
3596 * than we could load as push constants, so just always move all
3597 * uniform array access out to a pull constant buffer.
3598 */
3599 void
3600 vec4_visitor::move_uniform_array_access_to_pull_constants()
3601 {
3602 int pull_constant_loc[this->uniforms];
3603 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3604 bool nested_reladdr;
3605
3606 /* Walk through and find array access of uniforms. Put a copy of that
3607 * uniform in the pull constant buffer.
3608 *
3609 * Note that we don't move constant-indexed accesses to arrays. No
3610 * testing has been done of the performance impact of this choice.
3611 */
3612 do {
3613 nested_reladdr = false;
3614
3615 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3616 for (int i = 0; i < 3; i++) {
3617 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3618 continue;
3619
3620 int uniform = inst->src[i].reg;
3621
3622 if (inst->src[i].reladdr->reladdr)
3623 nested_reladdr = true; /* will need another pass */
3624
3625 /* If this array isn't already present in the pull constant buffer,
3626 * add it.
3627 */
3628 if (pull_constant_loc[uniform] == -1) {
3629 const gl_constant_value **values =
3630 &stage_prog_data->param[uniform * 4];
3631
3632 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3633
3634 assert(uniform < uniform_array_size);
3635 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3636 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3637 = values[j];
3638 }
3639 }
3640
3641 /* Set up the annotation tracking for new generated instructions. */
3642 base_ir = inst->ir;
3643 current_annotation = inst->annotation;
3644
3645 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3646
3647 emit_pull_constant_load(block, inst, temp, inst->src[i],
3648 pull_constant_loc[uniform]);
3649
3650 inst->src[i].file = temp.file;
3651 inst->src[i].reg = temp.reg;
3652 inst->src[i].reg_offset = temp.reg_offset;
3653 inst->src[i].reladdr = NULL;
3654 }
3655 }
3656 } while (nested_reladdr);
3657
3658 /* Now there are no accesses of the UNIFORM file with a reladdr, so we
3659 * no longer need to track uniforms as larger-than-vec4 objects. This is
3660 * relied on when cutting unused uniform vectors out of the push
3661 * constants.
3662 */
3663 split_uniform_registers();
3664 }
3665
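/**
 * Ensure that a negated unsigned-doubleword (UD) source is materialized.
 *
 * If @reg is a UD source with the negate modifier set, emit a MOV that
 * applies the negation into a fresh uvec4 temporary and make @reg point at
 * that temporary instead, so later consumers never see a negate modifier
 * on a UD operand.
 */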
3666 void
3667 vec4_visitor::resolve_ud_negate(src_reg *reg)
3668 {
3669 if (reg->type != BRW_REGISTER_TYPE_UD ||
3670 !reg->negate)
3671 return;
3672
3673 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3674 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3675 *reg = temp;
3676 }
3677
3678 /**
3679 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3680 *
3681 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3682 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3683 */
3684 void
3685 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3686 {
3687 assert(devinfo->gen <= 5);
3688
3689 if (!rvalue->type->is_boolean())
3690 return;
3691
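/* Isolate the one defined bit, then negate it: -(x & 1) is 0 when the LSB
 * is clear and ~0 (two's complement -1) when it is set, which gives the
 * canonical 0 / ~0 boolean encoding.
 */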
3692 src_reg and_result = src_reg(this, rvalue->type);
3693 src_reg neg_result = src_reg(this, rvalue->type);
3694 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3695 emit(MOV(dst_reg(neg_result), negate(and_result)));
3696 *reg = neg_result;
3697 }
3698
3699 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3700 void *log_data,
3701 struct gl_program *prog,
3702 const struct brw_vue_prog_key *key,
3703 struct brw_vue_prog_data *prog_data,
3704 struct gl_shader_program *shader_prog,
3705 gl_shader_stage stage,
3706 void *mem_ctx,
3707 bool no_spills,
3708 int shader_time_index)
3709 : backend_shader(compiler, log_data, mem_ctx,
3710 shader_prog, prog, &prog_data->base, stage),
3711 key(key),
3712 prog_data(prog_data),
3713 sanity_param_count(0),
3714 fail_msg(NULL),
3715 first_non_payload_grf(0),
3716 need_all_constants_in_pull_buffer(false),
3717 no_spills(no_spills),
3718 shader_time_index(shader_time_index),
3719 last_scratch(0)
3720 {
3721 this->failed = false;
3722
3723 this->base_ir = NULL;
3724 this->current_annotation = NULL;
3725 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3726
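/* variable_ht maps each ir_variable (keyed on its pointer) to the register
 * storage the visitor allocates for it.
 */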
3727 this->variable_ht = hash_table_ctor(0,
3728 hash_table_pointer_hash,
3729 hash_table_pointer_compare);
3730
3731 this->virtual_grf_start = NULL;
3732 this->virtual_grf_end = NULL;
3733 this->live_intervals = NULL;
3734
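/* Gen7+ has no MRF registers; the top of the GRF file is reserved to stand
 * in for them, so only GRFs below GEN7_MRF_HACK_START are available for
 * allocation.
 */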
3735 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3736
3737 this->uniforms = 0;
3738
3739 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3740 * at least one. See setup_uniforms() in brw_vec4.cpp.
3741 */
3742 this->uniform_array_size = 1;
3743 if (prog_data) {
3744 this->uniform_array_size =
3745 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3746 }
3747
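/* Per-uniform bookkeeping, one entry per potential vec4 uniform slot:
 * uniform_size[] is the number of vec4s a uniform occupies (used above when
 * copying arrays to the pull constant buffer), and uniform_vector_size[] is
 * the number of components actually used in each slot.
 */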
3748 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3749 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3750 }
3751
3752 vec4_visitor::~vec4_visitor()
3753 {
3754 hash_table_dtor(this->variable_ht);
3755 }
3756
3757
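/**
 * Mark the compile as failed and record the (first) failure message.
 *
 * Later calls are ignored, so fail_msg always holds the first reported
 * error.  The formatted message is prefixed with the stage abbreviation and
 * is also printed to stderr when shader debugging is enabled.
 */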
3758 void
3759 vec4_visitor::fail(const char *format, ...)
3760 {
3761 va_list va;
3762 char *msg;
3763
3764 if (failed)
3765 return;
3766
3767 failed = true;
3768
3769 va_start(va, format);
3770 msg = ralloc_vasprintf(mem_ctx, format, va);
3771 va_end(va);
3772 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3773
3774 this->fail_msg = msg;
3775
3776 if (debug_enabled) {
3777 fprintf(stderr, "%s", msg);
3778 }
3779 }
3780
3781 } /* namespace brw */