i965: Store a key_tex pointer in vec4_visitor.
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
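/* Instantiate helper emitters for the ALU opcodes below.  Each macro expands
 * to a method (e.g. vec4_visitor::ADD()) that allocates and returns a new
 * instruction without adding it to the instruction list; callers typically
 * wrap the result in emit().
 */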
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(const src_reg &src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
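/**
 * Resolve abs/negate source modifiers by copying the value through a MOV, so
 * the result can be used by instructions that would interpret the modifiers
 * differently (e.g. logical operations, where a negate modifier means
 * bitwise NOT on newer hardware).
 */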
315 src_reg
316 vec4_visitor::resolve_source_modifiers(const src_reg &src)
317 {
318 if (!src.abs && !src.negate)
319 return src;
320
321 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
322 resolved.type = src.type;
323 emit(MOV(resolved, src));
324
325 return src_reg(resolved);
326 }
327
328 src_reg
329 vec4_visitor::fix_math_operand(const src_reg &src)
330 {
331 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
332 return src;
333
334 /* The gen6 math instruction ignores the source modifiers --
335 * swizzle, abs, negate, and at least some parts of the register
336 * region description.
337 *
338 * Rather than trying to enumerate all these cases, *always* expand the
339 * operand to a temp GRF for gen6.
340 *
341 * For gen7, keep the operand as-is, except if immediate, which gen7 still
342 * can't use.
343 */
344
345 if (devinfo->gen == 7 && src.file != IMM)
346 return src;
347
348 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
349 expanded.type = src.type;
350 emit(MOV(expanded, src));
351 return src_reg(expanded);
352 }
353
354 vec4_instruction *
355 vec4_visitor::emit_math(enum opcode opcode,
356 const dst_reg &dst,
357 const src_reg &src0, const src_reg &src1)
358 {
359 vec4_instruction *math =
360 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
361
362 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
363 /* MATH on Gen6 must be align1, so we can't do writemasks. */
364 math->dst = dst_reg(this, glsl_type::vec4_type);
365 math->dst.type = dst.type;
366 math = emit(MOV(dst, src_reg(math->dst)));
367 } else if (devinfo->gen < 6) {
368 math->base_mrf = 1;
369 math->mlen = src1.file == BAD_FILE ? 1 : 2;
370 }
371
372 return math;
373 }
374
375 void
376 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
377 {
378 if (devinfo->gen < 7) {
379 unreachable("ir_unop_pack_half_2x16 should be lowered");
380 }
381
382 assert(dst.type == BRW_REGISTER_TYPE_UD);
383 assert(src0.type == BRW_REGISTER_TYPE_F);
384
385 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
386 *
387 * Because this instruction does not have a 16-bit floating-point type,
388 * the destination data type must be Word (W).
389 *
390 * The destination must be DWord-aligned and specify a horizontal stride
391 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
392 * each destination channel and the upper word is not modified.
393 *
394 * The above restriction implies that the f32to16 instruction must use
395 * align1 mode, because only in align1 mode is it possible to specify
396 * horizontal stride. We choose here to defy the hardware docs and emit
397 * align16 instructions.
398 *
399 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
400 * instructions. I was partially successful in that the code passed all
401 * tests. However, the code was dubiously correct and fragile, and the
402 * tests were not harsh enough to probe that frailty. Not trusting the
403 * code, I chose instead to remain in align16 mode in defiance of the hw
404 * docs).
405 *
406 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
407 * simulator, emitting a f32to16 in align16 mode with UD as destination
408 * data type is safe. The behavior differs from that specified in the PRM
409 * in that the upper word of each destination channel is cleared to 0.
410 */
411
412 dst_reg tmp_dst(this, glsl_type::uvec2_type);
413 src_reg tmp_src(tmp_dst);
414
415 #if 0
416 /* Verify the undocumented behavior on which the following instructions
417 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
418 * then the result of the bit-or instruction below will be incorrect.
419 *
420 * You should inspect the disasm output in order to verify that the MOV is
421 * not optimized away.
422 */
423 emit(MOV(tmp_dst, src_reg(0x12345678u)));
424 #endif
425
426 /* Give tmp the form below, where "." means untouched.
427 *
428 * w z y x w z y x
429 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
430 *
431 * That the upper word of each write-channel be 0 is required for the
432 * following bit-shift and bit-or instructions to work. Note that this
433 * relies on the undocumented hardware behavior mentioned above.
434 */
435 tmp_dst.writemask = WRITEMASK_XY;
436 emit(F32TO16(tmp_dst, src0));
437
438 /* Give the write-channels of dst the form:
439 * 0xhhhh0000
440 */
441 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
442 emit(SHL(dst, tmp_src, src_reg(16u)));
443
444 /* Finally, give the write-channels of dst the form of packHalf2x16's
445 * output:
446 * 0xhhhhllll
447 */
448 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
449 emit(OR(dst, src_reg(dst), tmp_src));
450 }
451
452 void
453 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
454 {
455 if (devinfo->gen < 7) {
456 unreachable("ir_unop_unpack_half_2x16 should be lowered");
457 }
458
459 assert(dst.type == BRW_REGISTER_TYPE_F);
460 assert(src0.type == BRW_REGISTER_TYPE_UD);
461
462 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
463 *
464 * Because this instruction does not have a 16-bit floating-point type,
465 * the source data type must be Word (W). The destination type must be
466 * F (Float).
467 *
468 * To use W as the source data type, we must adjust horizontal strides,
469 * which is only possible in align1 mode. All my [chadv] attempts at
470 * emitting align1 instructions for unpackHalf2x16 failed to pass the
471 * Piglit tests, so I gave up.
472 *
473 * I've verified that, on gen7 hardware and the simulator, it is safe to
474 * emit f16to32 in align16 mode with UD as source data type.
475 */
476
477 dst_reg tmp_dst(this, glsl_type::uvec2_type);
478 src_reg tmp_src(tmp_dst);
479
480 tmp_dst.writemask = WRITEMASK_X;
481 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
482
483 tmp_dst.writemask = WRITEMASK_Y;
484 emit(SHR(tmp_dst, src0, src_reg(16u)));
485
486 dst.writemask = WRITEMASK_XY;
487 emit(F16TO32(dst, tmp_src));
488 }
489
490 void
491 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
492 {
493 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
494 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
495 * is not suitable to generate the shift values, but we can use the packed
496 * vector float and a type-converting MOV.
497 */
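   /* 0x00, 0x60, 0x70 and 0x78 are the vector-float (VF) encodings of 0.0,
    * 8.0, 16.0 and 24.0, so the type-converting MOV into the UD-typed
    * `shift' register below yields the integer shift counts <0, 8, 16, 24>.
    */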
498 dst_reg shift(this, glsl_type::uvec4_type);
499 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
500
501 dst_reg shifted(this, glsl_type::uvec4_type);
502 src0.swizzle = BRW_SWIZZLE_XXXX;
503 emit(SHR(shifted, src0, src_reg(shift)));
504
505 shifted.type = BRW_REGISTER_TYPE_UB;
506 dst_reg f(this, glsl_type::vec4_type);
507 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
508
509 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
510 }
511
512 void
513 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
514 {
515 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
516 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
517 * is not suitable to generate the shift values, but we can use the packed
518 * vector float and a type-converting MOV.
519 */
520 dst_reg shift(this, glsl_type::uvec4_type);
521 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
522
523 dst_reg shifted(this, glsl_type::uvec4_type);
524 src0.swizzle = BRW_SWIZZLE_XXXX;
525 emit(SHR(shifted, src0, src_reg(shift)));
526
527 shifted.type = BRW_REGISTER_TYPE_B;
528 dst_reg f(this, glsl_type::vec4_type);
529 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
533
534 dst_reg max(this, glsl_type::vec4_type);
535 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
536 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
537 }
538
539 void
540 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
541 {
542 dst_reg saturated(this, glsl_type::vec4_type);
543 vec4_instruction *inst = emit(MOV(saturated, src0));
544 inst->saturate = true;
545
546 dst_reg scaled(this, glsl_type::vec4_type);
547 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
548
549 dst_reg rounded(this, glsl_type::vec4_type);
550 emit(RNDE(rounded, src_reg(scaled)));
551
552 dst_reg u(this, glsl_type::uvec4_type);
553 emit(MOV(u, src_reg(rounded)));
554
555 src_reg bytes(u);
556 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
557 }
558
559 void
560 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
561 {
562 dst_reg max(this, glsl_type::vec4_type);
563 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
564
565 dst_reg min(this, glsl_type::vec4_type);
566 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
567
568 dst_reg scaled(this, glsl_type::vec4_type);
569 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
570
571 dst_reg rounded(this, glsl_type::vec4_type);
572 emit(RNDE(rounded, src_reg(scaled)));
573
574 dst_reg i(this, glsl_type::ivec4_type);
575 emit(MOV(i, src_reg(rounded)));
576
577 src_reg bytes(i);
578 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
579 }
580
581 void
582 vec4_visitor::visit_instructions(const exec_list *list)
583 {
584 foreach_in_list(ir_instruction, ir, list) {
585 base_ir = ir;
586 ir->accept(this);
587 }
588 }
589
590 /**
591 * Returns the minimum number of vec4 elements needed to pack a type.
592 *
593 * For simple types, it will return 1 (a single vec4); for matrices, the
594  * number of columns; for arrays and structs, the sum of the sizes of
595  * their elements; and for samplers and atomics, zero.
596 *
597 * This method is useful to calculate how much register space is needed to
598 * store a particular type.
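 *
 * For example: a float or a vec3 takes 1 vec4; a mat3 takes 3 (one per
 * column); a vec2[8] array takes 8; and a struct { vec3 v; float f; }
 * takes 2.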
599 */
600 extern "C" int
601 type_size_vec4(const struct glsl_type *type)
602 {
603 unsigned int i;
604 int size;
605
606 switch (type->base_type) {
607 case GLSL_TYPE_UINT:
608 case GLSL_TYPE_INT:
609 case GLSL_TYPE_FLOAT:
610 case GLSL_TYPE_BOOL:
611 if (type->is_matrix()) {
612 return type->matrix_columns;
613 } else {
614          /* Regardless of the size of the vector, it gets a vec4. This is bad
615 * packing for things like floats, but otherwise arrays become a
616 * mess. Hopefully a later pass over the code can pack scalars
617 * down if appropriate.
618 */
619 return 1;
620 }
621 case GLSL_TYPE_ARRAY:
622 assert(type->length > 0);
623 return type_size_vec4(type->fields.array) * type->length;
624 case GLSL_TYPE_STRUCT:
625 size = 0;
626 for (i = 0; i < type->length; i++) {
627 size += type_size_vec4(type->fields.structure[i].type);
628 }
629 return size;
630 case GLSL_TYPE_SUBROUTINE:
631 return 1;
632
633 case GLSL_TYPE_SAMPLER:
634 /* Samplers take up no register space, since they're baked in at
635 * link time.
636 */
637 return 0;
638 case GLSL_TYPE_ATOMIC_UINT:
639 return 0;
640 case GLSL_TYPE_IMAGE:
641 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
642 case GLSL_TYPE_VOID:
643 case GLSL_TYPE_DOUBLE:
644 case GLSL_TYPE_ERROR:
645 case GLSL_TYPE_INTERFACE:
646 unreachable("not reached");
647 }
648
649 return 0;
650 }
651
652 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
653 {
654 init();
655
656 this->file = GRF;
657 this->reg = v->alloc.allocate(type_size_vec4(type));
658
659 if (type->is_array() || type->is_record()) {
660 this->swizzle = BRW_SWIZZLE_NOOP;
661 } else {
662 this->swizzle = brw_swizzle_for_size(type->vector_elements);
663 }
664
665 this->type = brw_type_for_base_type(type);
666 }
667
668 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
669 {
670 assert(size > 0);
671
672 init();
673
674 this->file = GRF;
675 this->reg = v->alloc.allocate(type_size_vec4(type) * size);
676
677 this->swizzle = BRW_SWIZZLE_NOOP;
678
679 this->type = brw_type_for_base_type(type);
680 }
681
682 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
683 {
684 init();
685
686 this->file = GRF;
687 this->reg = v->alloc.allocate(type_size_vec4(type));
688
689 if (type->is_array() || type->is_record()) {
690 this->writemask = WRITEMASK_XYZW;
691 } else {
692 this->writemask = (1 << type->vector_elements) - 1;
693 }
694
695 this->type = brw_type_for_base_type(type);
696 }
697
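/**
 * Store pointers to the gl_constant_values for one vec4's worth of uniform
 * data starting at param_offset, pointing any of the 4 - n unused trailing
 * components at a shared zero constant.
 */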
698 void
699 vec4_visitor::setup_vec4_uniform_value(unsigned param_offset,
700 const gl_constant_value *values,
701 unsigned n)
702 {
703 static const gl_constant_value zero = { 0 };
704
705 assert(param_offset % 4 == 0);
706
707 for (unsigned i = 0; i < n; ++i)
708 stage_prog_data->param[param_offset + i] = &values[i];
709
710 for (unsigned i = n; i < 4; ++i)
711 stage_prog_data->param[param_offset + i] = &zero;
712
713 uniform_vector_size[param_offset / 4] = n;
714 }
715
716 /* Our support for uniforms is piggy-backed on the struct
717 * gl_fragment_program, because that's where the values actually
718 * get stored, rather than in some global gl_shader_program uniform
719 * store.
720 */
721 void
722 vec4_visitor::setup_uniform_values(ir_variable *ir)
723 {
724 int namelen = strlen(ir->name);
725
726 /* The data for our (non-builtin) uniforms is stored in a series of
727 * gl_uniform_driver_storage structs for each subcomponent that
728 * glGetUniformLocation() could name. We know it's been set up in the same
729 * order we'd walk the type, so walk the list of storage and find anything
730 * with our name, or the prefix of a component that starts with our name.
731 */
732 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
733 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
734
735 if (storage->builtin)
736 continue;
737
738 if (strncmp(ir->name, storage->name, namelen) != 0 ||
739 (storage->name[namelen] != 0 &&
740 storage->name[namelen] != '.' &&
741 storage->name[namelen] != '[')) {
742 continue;
743 }
744
745 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
746 storage->type->matrix_columns);
747 const unsigned vector_size = storage->type->vector_elements;
748
749 for (unsigned s = 0; s < vector_count; s++) {
750 setup_vec4_uniform_value(uniforms * 4,
751 &storage->storage[s * vector_size],
752 vector_size);
753 uniforms++;
754 }
755 }
756 }
757
758 /* Our support for builtin uniforms is even scarier than non-builtin.
759 * It sits on top of the PROG_STATE_VAR parameters that are
760 * automatically updated from GL context state.
761 */
762 void
763 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
764 {
765 const ir_state_slot *const slots = ir->get_state_slots();
766 assert(slots != NULL);
767
768 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
769  * This state reference has already been set up by ir_to_mesa,
770 * but we'll get the same index back here. We can reference
771 * ParameterValues directly, since unlike brw_fs.cpp, we never
772 * add new state references during compile.
773 */
774 int index = _mesa_add_state_reference(this->prog->Parameters,
775 (gl_state_index *)slots[i].tokens);
776 gl_constant_value *values =
777 &this->prog->Parameters->ParameterValues[index][0];
778
779 assert(this->uniforms < uniform_array_size);
780
781 for (unsigned j = 0; j < 4; j++)
782 stage_prog_data->param[this->uniforms * 4 + j] =
783 &values[GET_SWZ(slots[i].swizzle, j)];
784
785 this->uniform_vector_size[this->uniforms] =
786 (ir->type->is_scalar() || ir->type->is_vector() ||
787 ir->type->is_matrix() ? ir->type->vector_elements : 4);
788
789 this->uniforms++;
790 }
791 }
792
793 dst_reg *
794 vec4_visitor::variable_storage(ir_variable *var)
795 {
796 return (dst_reg *)hash_table_find(this->variable_ht, var);
797 }
798
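/**
 * Evaluate a boolean rvalue and leave the result in the flag register, so a
 * following instruction can be predicated on it.
 *
 * *predicate is set to the predicate mode the caller should use: NORMAL for
 * per-channel conditions, or ALIGN16_ALL4H/ALIGN16_ANY4H for all()/any()
 * style reductions.
 */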
799 void
800 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
801 enum brw_predicate *predicate)
802 {
803 ir_expression *expr = ir->as_expression();
804
805 *predicate = BRW_PREDICATE_NORMAL;
806
807 if (expr && expr->operation != ir_binop_ubo_load) {
808 src_reg op[3];
809 vec4_instruction *inst;
810
811 assert(expr->get_num_operands() <= 3);
812 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
813 expr->operands[i]->accept(this);
814 op[i] = this->result;
815
816 resolve_ud_negate(&op[i]);
817 }
818
819 switch (expr->operation) {
820 case ir_unop_logic_not:
821 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
822 inst->conditional_mod = BRW_CONDITIONAL_Z;
823 break;
824
825 case ir_binop_logic_xor:
826 if (devinfo->gen <= 5) {
827 src_reg temp = src_reg(this, ir->type);
828 emit(XOR(dst_reg(temp), op[0], op[1]));
829 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
830 } else {
831 inst = emit(XOR(dst_null_d(), op[0], op[1]));
832 }
833 inst->conditional_mod = BRW_CONDITIONAL_NZ;
834 break;
835
836 case ir_binop_logic_or:
837 if (devinfo->gen <= 5) {
838 src_reg temp = src_reg(this, ir->type);
839 emit(OR(dst_reg(temp), op[0], op[1]));
840 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
841 } else {
842 inst = emit(OR(dst_null_d(), op[0], op[1]));
843 }
844 inst->conditional_mod = BRW_CONDITIONAL_NZ;
845 break;
846
847 case ir_binop_logic_and:
848 if (devinfo->gen <= 5) {
849 src_reg temp = src_reg(this, ir->type);
850 emit(AND(dst_reg(temp), op[0], op[1]));
851 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
852 } else {
853 inst = emit(AND(dst_null_d(), op[0], op[1]));
854 }
855 inst->conditional_mod = BRW_CONDITIONAL_NZ;
856 break;
857
858 case ir_unop_f2b:
859 if (devinfo->gen >= 6) {
860 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
861 } else {
862 inst = emit(MOV(dst_null_f(), op[0]));
863 inst->conditional_mod = BRW_CONDITIONAL_NZ;
864 }
865 break;
866
867 case ir_unop_i2b:
868 if (devinfo->gen >= 6) {
869 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
870 } else {
871 inst = emit(MOV(dst_null_d(), op[0]));
872 inst->conditional_mod = BRW_CONDITIONAL_NZ;
873 }
874 break;
875
876 case ir_binop_all_equal:
877 if (devinfo->gen <= 5) {
878 resolve_bool_comparison(expr->operands[0], &op[0]);
879 resolve_bool_comparison(expr->operands[1], &op[1]);
880 }
881 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
882 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
883 break;
884
885 case ir_binop_any_nequal:
886 if (devinfo->gen <= 5) {
887 resolve_bool_comparison(expr->operands[0], &op[0]);
888 resolve_bool_comparison(expr->operands[1], &op[1]);
889 }
890 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
891 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
892 break;
893
894 case ir_unop_any:
895 if (devinfo->gen <= 5) {
896 resolve_bool_comparison(expr->operands[0], &op[0]);
897 }
898 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
899 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
900 break;
901
902 case ir_binop_greater:
903 case ir_binop_gequal:
904 case ir_binop_less:
905 case ir_binop_lequal:
906 case ir_binop_equal:
907 case ir_binop_nequal:
908 if (devinfo->gen <= 5) {
909 resolve_bool_comparison(expr->operands[0], &op[0]);
910 resolve_bool_comparison(expr->operands[1], &op[1]);
911 }
912 emit(CMP(dst_null_d(), op[0], op[1],
913 brw_conditional_for_comparison(expr->operation)));
914 break;
915
916 case ir_triop_csel: {
917 /* Expand the boolean condition into the flag register. */
918 inst = emit(MOV(dst_null_d(), op[0]));
919 inst->conditional_mod = BRW_CONDITIONAL_NZ;
920
921 /* Select which boolean to return. */
922 dst_reg temp(this, expr->operands[1]->type);
923 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
924 inst->predicate = BRW_PREDICATE_NORMAL;
925
926 /* Expand the result to a condition code. */
927 inst = emit(MOV(dst_null_d(), src_reg(temp)));
928 inst->conditional_mod = BRW_CONDITIONAL_NZ;
929 break;
930 }
931
932 default:
933 unreachable("not reached");
934 }
935 return;
936 }
937
938 ir->accept(this);
939
940 resolve_ud_negate(&this->result);
941
942 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
943 inst->conditional_mod = BRW_CONDITIONAL_NZ;
944 }
945
946 /**
947 * Emit a gen6 IF statement with the comparison folded into the IF
948 * instruction.
949 */
950 void
951 vec4_visitor::emit_if_gen6(ir_if *ir)
952 {
953 ir_expression *expr = ir->condition->as_expression();
954
955 if (expr && expr->operation != ir_binop_ubo_load) {
956 src_reg op[3];
957 dst_reg temp;
958
959 assert(expr->get_num_operands() <= 3);
960 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
961 expr->operands[i]->accept(this);
962 op[i] = this->result;
963 }
964
965 switch (expr->operation) {
966 case ir_unop_logic_not:
967 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
968 return;
969
970 case ir_binop_logic_xor:
971 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
972 return;
973
974 case ir_binop_logic_or:
975 temp = dst_reg(this, glsl_type::bool_type);
976 emit(OR(temp, op[0], op[1]));
977 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
978 return;
979
980 case ir_binop_logic_and:
981 temp = dst_reg(this, glsl_type::bool_type);
982 emit(AND(temp, op[0], op[1]));
983 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
984 return;
985
986 case ir_unop_f2b:
987 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
988 return;
989
990 case ir_unop_i2b:
991 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
992 return;
993
994 case ir_binop_greater:
995 case ir_binop_gequal:
996 case ir_binop_less:
997 case ir_binop_lequal:
998 case ir_binop_equal:
999 case ir_binop_nequal:
1000 emit(IF(op[0], op[1],
1001 brw_conditional_for_comparison(expr->operation)));
1002 return;
1003
1004 case ir_binop_all_equal:
1005 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1006 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1007 return;
1008
1009 case ir_binop_any_nequal:
1010 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1011 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1012 return;
1013
1014 case ir_unop_any:
1015 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1016 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1017 return;
1018
1019 case ir_triop_csel: {
1020 /* Expand the boolean condition into the flag register. */
1021 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1022 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1023
1024 /* Select which boolean to return. */
1025 dst_reg temp(this, expr->operands[1]->type);
1026 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1027 inst->predicate = BRW_PREDICATE_NORMAL;
1028
1029 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1030 return;
1031 }
1032
1033 default:
1034 unreachable("not reached");
1035 }
1036 return;
1037 }
1038
1039 ir->condition->accept(this);
1040
1041 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1042 }
1043
1044 void
1045 vec4_visitor::visit(ir_variable *ir)
1046 {
1047 dst_reg *reg = NULL;
1048
1049 if (variable_storage(ir))
1050 return;
1051
1052 switch (ir->data.mode) {
1053 case ir_var_shader_in:
1054 assert(ir->data.location != -1);
1055 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1056 break;
1057
1058 case ir_var_shader_out:
1059 assert(ir->data.location != -1);
1060 reg = new(mem_ctx) dst_reg(this, ir->type);
1061
1062 for (int i = 0; i < type_size_vec4(ir->type); i++) {
1063 output_reg[ir->data.location + i] = *reg;
1064 output_reg[ir->data.location + i].reg_offset = i;
1065 output_reg_annotation[ir->data.location + i] = ir->name;
1066 }
1067 break;
1068
1069 case ir_var_auto:
1070 case ir_var_temporary:
1071 reg = new(mem_ctx) dst_reg(this, ir->type);
1072 break;
1073
1074 case ir_var_uniform:
1075 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1076
1077 /* Thanks to the lower_ubo_reference pass, we will see only
1078 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1079 * variables, so no need for them to be in variable_ht.
1080 *
1081 * Some uniforms, such as samplers and atomic counters, have no actual
1082 * storage, so we should ignore them.
1083 */
1084 if (ir->is_in_buffer_block() || type_size_vec4(ir->type) == 0)
1085 return;
1086
1087 /* Track how big the whole uniform variable is, in case we need to put a
1088 * copy of its data into pull constants for array access.
1089 */
1090 assert(this->uniforms < uniform_array_size);
1091 this->uniform_size[this->uniforms] = type_size_vec4(ir->type);
1092
1093 if (!strncmp(ir->name, "gl_", 3)) {
1094 setup_builtin_uniform_values(ir);
1095 } else {
1096 setup_uniform_values(ir);
1097 }
1098 break;
1099
1100 case ir_var_system_value:
1101 reg = make_reg_for_system_value(ir->data.location, ir->type);
1102 break;
1103
1104 default:
1105 unreachable("not reached");
1106 }
1107
1108 reg->type = brw_type_for_base_type(ir->type);
1109 hash_table_insert(this->variable_ht, reg, ir);
1110 }
1111
1112 void
1113 vec4_visitor::visit(ir_loop *ir)
1114 {
1115 /* We don't want debugging output to print the whole body of the
1116 * loop as the annotation.
1117 */
1118 this->base_ir = NULL;
1119
1120 emit(BRW_OPCODE_DO);
1121
1122 visit_instructions(&ir->body_instructions);
1123
1124 emit(BRW_OPCODE_WHILE);
1125 }
1126
1127 void
1128 vec4_visitor::visit(ir_loop_jump *ir)
1129 {
1130 switch (ir->mode) {
1131 case ir_loop_jump::jump_break:
1132 emit(BRW_OPCODE_BREAK);
1133 break;
1134 case ir_loop_jump::jump_continue:
1135 emit(BRW_OPCODE_CONTINUE);
1136 break;
1137 }
1138 }
1139
1140
1141 void
1142 vec4_visitor::visit(ir_function_signature *)
1143 {
1144 unreachable("not reached");
1145 }
1146
1147 void
1148 vec4_visitor::visit(ir_function *ir)
1149 {
1150 /* Ignore function bodies other than main() -- we shouldn't see calls to
1151 * them since they should all be inlined.
1152 */
1153 if (strcmp(ir->name, "main") == 0) {
1154 const ir_function_signature *sig;
1155 exec_list empty;
1156
1157 sig = ir->matching_signature(NULL, &empty, false);
1158
1159 assert(sig);
1160
1161 visit_instructions(&sig->body);
1162 }
1163 }
1164
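/**
 * Try to emit an add-of-multiply expression as a single MAD instruction.
 *
 * Looks for a multiply (possibly wrapped in a negate or abs) among the add's
 * operands and folds the negate/abs into the MAD sources.  Only possible on
 * gen6+ and only for float types.
 */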
1165 bool
1166 vec4_visitor::try_emit_mad(ir_expression *ir)
1167 {
1168 /* 3-src instructions were introduced in gen6. */
1169 if (devinfo->gen < 6)
1170 return false;
1171
1172 /* MAD can only handle floating-point data. */
1173 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1174 return false;
1175
1176 ir_rvalue *nonmul;
1177 ir_expression *mul;
1178 bool mul_negate, mul_abs;
1179
1180 for (int i = 0; i < 2; i++) {
1181 mul_negate = false;
1182 mul_abs = false;
1183
1184 mul = ir->operands[i]->as_expression();
1185 nonmul = ir->operands[1 - i];
1186
1187 if (mul && mul->operation == ir_unop_abs) {
1188 mul = mul->operands[0]->as_expression();
1189 mul_abs = true;
1190 } else if (mul && mul->operation == ir_unop_neg) {
1191 mul = mul->operands[0]->as_expression();
1192 mul_negate = true;
1193 }
1194
1195 if (mul && mul->operation == ir_binop_mul)
1196 break;
1197 }
1198
1199 if (!mul || mul->operation != ir_binop_mul)
1200 return false;
1201
1202 nonmul->accept(this);
1203 src_reg src0 = fix_3src_operand(this->result);
1204
1205 mul->operands[0]->accept(this);
1206 src_reg src1 = fix_3src_operand(this->result);
1207 src1.negate ^= mul_negate;
1208 src1.abs = mul_abs;
1209 if (mul_abs)
1210 src1.negate = false;
1211
1212 mul->operands[1]->accept(this);
1213 src_reg src2 = fix_3src_operand(this->result);
1214 src2.abs = mul_abs;
1215 if (mul_abs)
1216 src2.negate = false;
1217
1218 this->result = src_reg(this, ir->type);
1219 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1220
1221 return true;
1222 }
1223
1224 bool
1225 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1226 {
1227 /* This optimization relies on CMP setting the destination to 0 when
1228 * false. Early hardware only sets the least significant bit, and
1229 * leaves the other bits undefined. So we can't use it.
1230 */
1231 if (devinfo->gen < 6)
1232 return false;
1233
1234 ir_expression *const cmp = ir->operands[0]->as_expression();
1235
1236 if (cmp == NULL)
1237 return false;
1238
1239 switch (cmp->operation) {
1240 case ir_binop_less:
1241 case ir_binop_greater:
1242 case ir_binop_lequal:
1243 case ir_binop_gequal:
1244 case ir_binop_equal:
1245 case ir_binop_nequal:
1246 break;
1247
1248 default:
1249 return false;
1250 }
1251
1252 cmp->operands[0]->accept(this);
1253 const src_reg cmp_src0 = this->result;
1254
1255 cmp->operands[1]->accept(this);
1256 const src_reg cmp_src1 = this->result;
1257
1258 this->result = src_reg(this, ir->type);
1259
1260 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1261 brw_conditional_for_comparison(cmp->operation)));
1262
1263 /* If the comparison is false, this->result will just happen to be zero.
1264 */
1265 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1266 this->result, src_reg(1.0f));
1267 inst->predicate = BRW_PREDICATE_NORMAL;
1268 inst->predicate_inverse = true;
1269
1270 return true;
1271 }
1272
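/**
 * Emit a MIN or MAX operation, selected by the conditional mod
 * (BRW_CONDITIONAL_L for min, BRW_CONDITIONAL_GE for max).  Gen6+ can use a
 * single SEL with a conditional mod; older hardware needs a CMP to set the
 * flag followed by a predicated SEL.
 */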
1273 vec4_instruction *
1274 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1275 src_reg src0, src_reg src1)
1276 {
1277 vec4_instruction *inst;
1278
1279 if (devinfo->gen >= 6) {
1280 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1281 inst->conditional_mod = conditionalmod;
1282 } else {
1283 emit(CMP(dst, src0, src1, conditionalmod));
1284
1285 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1286 inst->predicate = BRW_PREDICATE_NORMAL;
1287 }
1288
1289 return inst;
1290 }
1291
1292 vec4_instruction *
1293 vec4_visitor::emit_lrp(const dst_reg &dst,
1294 const src_reg &x, const src_reg &y, const src_reg &a)
1295 {
1296 if (devinfo->gen >= 6) {
1297 /* Note that the instruction's argument order is reversed from GLSL
1298 * and the IR.
1299 */
1300 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1301 fix_3src_operand(x)));
1302 } else {
1303 /* Earlier generations don't support three source operations, so we
1304 * need to emit x*(1-a) + y*a.
1305 */
1306 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1307 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1308 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1309 y_times_a.writemask = dst.writemask;
1310 one_minus_a.writemask = dst.writemask;
1311 x_times_one_minus_a.writemask = dst.writemask;
1312
1313 emit(MUL(y_times_a, y, a));
1314 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1315 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1316 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1317 }
1318 }
1319
1320 /**
1321 * Emits the instructions needed to perform a pull constant load. before_block
1322  * and before_inst can be NULL, in which case the instructions will be appended
1323 * to the end of the instruction list.
1324 */
1325 void
1326 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1327 src_reg surf_index,
1328 src_reg offset_reg,
1329 bblock_t *before_block,
1330 vec4_instruction *before_inst)
1331 {
1332 assert((before_inst == NULL && before_block == NULL) ||
1333 (before_inst && before_block));
1334
1335 vec4_instruction *pull;
1336
1337 if (devinfo->gen >= 9) {
1338 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1339 src_reg header(this, glsl_type::uvec4_type, 2);
1340
1341 pull = new(mem_ctx)
1342 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1343 dst_reg(header));
1344
1345 if (before_inst)
1346 emit_before(before_block, before_inst, pull);
1347 else
1348 emit(pull);
1349
1350 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1351 offset_reg.type);
1352 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1353
1354 if (before_inst)
1355 emit_before(before_block, before_inst, pull);
1356 else
1357 emit(pull);
1358
1359 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1360 dst,
1361 surf_index,
1362 header);
1363 pull->mlen = 2;
1364 pull->header_size = 1;
1365 } else if (devinfo->gen >= 7) {
1366 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1367
1368 grf_offset.type = offset_reg.type;
1369
1370 pull = MOV(grf_offset, offset_reg);
1371
1372 if (before_inst)
1373 emit_before(before_block, before_inst, pull);
1374 else
1375 emit(pull);
1376
1377 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1378 dst,
1379 surf_index,
1380 src_reg(grf_offset));
1381 pull->mlen = 1;
1382 } else {
1383 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1384 dst,
1385 surf_index,
1386 offset_reg);
1387 pull->base_mrf = 14;
1388 pull->mlen = 1;
1389 }
1390
1391 if (before_inst)
1392 emit_before(before_block, before_inst, pull);
1393 else
1394 emit(pull);
1395 }
1396
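/**
 * Copy the value of an arbitrary live channel of \p src into every channel
 * of the result, producing a dynamically uniform value (used, for example,
 * for indirect surface indices).
 */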
1397 src_reg
1398 vec4_visitor::emit_uniformize(const src_reg &src)
1399 {
1400 const src_reg chan_index(this, glsl_type::uint_type);
1401 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1402 src.type);
1403
1404 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1405 ->force_writemask_all = true;
1406 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1407 ->force_writemask_all = true;
1408
1409 return src_reg(dst);
1410 }
1411
1412 void
1413 vec4_visitor::visit(ir_expression *ir)
1414 {
1415 unsigned int operand;
1416 src_reg op[ARRAY_SIZE(ir->operands)];
1417 vec4_instruction *inst;
1418
1419 if (ir->operation == ir_binop_add) {
1420 if (try_emit_mad(ir))
1421 return;
1422 }
1423
1424 if (ir->operation == ir_unop_b2f) {
1425 if (try_emit_b2f_of_compare(ir))
1426 return;
1427 }
1428
1429 /* Storage for our result. Ideally for an assignment we'd be using
1430 * the actual storage for the result here, instead.
1431 */
1432 dst_reg result_dst(this, ir->type);
1433 src_reg result_src(result_dst);
1434
1435 if (ir->operation == ir_triop_csel) {
1436 ir->operands[1]->accept(this);
1437 op[1] = this->result;
1438 ir->operands[2]->accept(this);
1439 op[2] = this->result;
1440
1441 enum brw_predicate predicate;
1442 emit_bool_to_cond_code(ir->operands[0], &predicate);
1443 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1444 inst->predicate = predicate;
1445 this->result = result_src;
1446 return;
1447 }
1448
1449 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1450 this->result.file = BAD_FILE;
1451 ir->operands[operand]->accept(this);
1452 if (this->result.file == BAD_FILE) {
1453 fprintf(stderr, "Failed to get tree for expression operand:\n");
1454 ir->operands[operand]->fprint(stderr);
1455 exit(1);
1456 }
1457 op[operand] = this->result;
1458
1459 /* Matrix expression operands should have been broken down to vector
1460 * operations already.
1461 */
1462 assert(!ir->operands[operand]->type->is_matrix());
1463 }
1464
1465 /* If nothing special happens, this is the result. */
1466 this->result = result_src;
1467
1468 switch (ir->operation) {
1469 case ir_unop_logic_not:
1470 emit(NOT(result_dst, op[0]));
1471 break;
1472 case ir_unop_neg:
1473 op[0].negate = !op[0].negate;
1474 emit(MOV(result_dst, op[0]));
1475 break;
1476 case ir_unop_abs:
1477 op[0].abs = true;
1478 op[0].negate = false;
1479 emit(MOV(result_dst, op[0]));
1480 break;
1481
1482 case ir_unop_sign:
1483 if (ir->type->is_float()) {
1484 /* AND(val, 0x80000000) gives the sign bit.
1485 *
1486 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1487 * zero.
1488 */
1489 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1490
1491 op[0].type = BRW_REGISTER_TYPE_UD;
1492 result_dst.type = BRW_REGISTER_TYPE_UD;
1493 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1494
1495 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1496 inst->predicate = BRW_PREDICATE_NORMAL;
1497
1498 this->result.type = BRW_REGISTER_TYPE_F;
1499 } else {
1500 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1501 * -> non-negative val generates 0x00000000.
1502 * Predicated OR sets 1 if val is positive.
1503 */
1504 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1505
1506 emit(ASR(result_dst, op[0], src_reg(31)));
1507
1508 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1509 inst->predicate = BRW_PREDICATE_NORMAL;
1510 }
1511 break;
1512
1513 case ir_unop_rcp:
1514 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1515 break;
1516
1517 case ir_unop_exp2:
1518 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1519 break;
1520 case ir_unop_log2:
1521 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1522 break;
1523 case ir_unop_exp:
1524 case ir_unop_log:
1525 unreachable("not reached: should be handled by ir_explog_to_explog2");
1526 case ir_unop_sin:
1527 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1528 break;
1529 case ir_unop_cos:
1530 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1531 break;
1532
1533 case ir_unop_dFdx:
1534 case ir_unop_dFdx_coarse:
1535 case ir_unop_dFdx_fine:
1536 case ir_unop_dFdy:
1537 case ir_unop_dFdy_coarse:
1538 case ir_unop_dFdy_fine:
1539 unreachable("derivatives not valid in vertex shader");
1540
1541 case ir_unop_bitfield_reverse:
1542 emit(BFREV(result_dst, op[0]));
1543 break;
1544 case ir_unop_bit_count:
1545 emit(CBIT(result_dst, op[0]));
1546 break;
1547 case ir_unop_find_msb: {
1548 src_reg temp = src_reg(this, glsl_type::uint_type);
1549
1550 inst = emit(FBH(dst_reg(temp), op[0]));
1551 inst->dst.writemask = WRITEMASK_XYZW;
1552
1553 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1554 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1555 * subtract the result from 31 to convert the MSB count into an LSB count.
1556 */
1557
1558 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1559 temp.swizzle = BRW_SWIZZLE_NOOP;
1560 emit(MOV(result_dst, temp));
1561
1562 src_reg src_tmp = src_reg(result_dst);
1563 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1564
1565 src_tmp.negate = true;
1566 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1567 inst->predicate = BRW_PREDICATE_NORMAL;
1568 break;
1569 }
1570 case ir_unop_find_lsb:
1571 emit(FBL(result_dst, op[0]));
1572 break;
1573 case ir_unop_saturate:
1574 inst = emit(MOV(result_dst, op[0]));
1575 inst->saturate = true;
1576 break;
1577
1578 case ir_unop_noise:
1579 unreachable("not reached: should be handled by lower_noise");
1580
1581 case ir_unop_subroutine_to_int:
1582 emit(MOV(result_dst, op[0]));
1583 break;
1584
1585 case ir_binop_add:
1586 emit(ADD(result_dst, op[0], op[1]));
1587 break;
1588 case ir_binop_sub:
1589 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1590
1591 case ir_binop_mul:
1592 if (devinfo->gen < 8 && ir->type->is_integer()) {
1593 /* For integer multiplication, the MUL uses the low 16 bits of one of
1594 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1595  * adds in the contribution of the upper 16 bits of that
1596 * operand. If we can determine that one of the args is in the low
1597 * 16 bits, though, we can just emit a single MUL.
1598 */
1599 if (ir->operands[0]->is_uint16_constant()) {
1600 if (devinfo->gen < 7)
1601 emit(MUL(result_dst, op[0], op[1]));
1602 else
1603 emit(MUL(result_dst, op[1], op[0]));
1604 } else if (ir->operands[1]->is_uint16_constant()) {
1605 if (devinfo->gen < 7)
1606 emit(MUL(result_dst, op[1], op[0]));
1607 else
1608 emit(MUL(result_dst, op[0], op[1]));
1609 } else {
1610 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1611
1612 emit(MUL(acc, op[0], op[1]));
1613 emit(MACH(dst_null_d(), op[0], op[1]));
1614 emit(MOV(result_dst, src_reg(acc)));
1615 }
1616 } else {
1617 emit(MUL(result_dst, op[0], op[1]));
1618 }
1619 break;
1620 case ir_binop_imul_high: {
1621 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1622
1623 emit(MUL(acc, op[0], op[1]));
1624 emit(MACH(result_dst, op[0], op[1]));
1625 break;
1626 }
1627 case ir_binop_div:
1628 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1629 assert(ir->type->is_integer());
1630 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1631 break;
1632
1633 case ir_binop_carry:
1634 unreachable("Should have been lowered by carry_to_arith().");
1635
1636 case ir_binop_borrow:
1637 unreachable("Should have been lowered by borrow_to_arith().");
1638
1639 case ir_binop_mod:
1640 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1641 assert(ir->type->is_integer());
1642 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1643 break;
1644
1645 case ir_binop_less:
1646 case ir_binop_greater:
1647 case ir_binop_lequal:
1648 case ir_binop_gequal:
1649 case ir_binop_equal:
1650 case ir_binop_nequal: {
1651 if (devinfo->gen <= 5) {
1652 resolve_bool_comparison(ir->operands[0], &op[0]);
1653 resolve_bool_comparison(ir->operands[1], &op[1]);
1654 }
1655 emit(CMP(result_dst, op[0], op[1],
1656 brw_conditional_for_comparison(ir->operation)));
1657 break;
1658 }
1659
1660 case ir_binop_all_equal:
1661 if (devinfo->gen <= 5) {
1662 resolve_bool_comparison(ir->operands[0], &op[0]);
1663 resolve_bool_comparison(ir->operands[1], &op[1]);
1664 }
1665
1666 /* "==" operator producing a scalar boolean. */
1667 if (ir->operands[0]->type->is_vector() ||
1668 ir->operands[1]->type->is_vector()) {
1669 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1670 emit(MOV(result_dst, src_reg(0)));
1671 inst = emit(MOV(result_dst, src_reg(~0)));
1672 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1673 } else {
1674 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1675 }
1676 break;
1677 case ir_binop_any_nequal:
1678 if (devinfo->gen <= 5) {
1679 resolve_bool_comparison(ir->operands[0], &op[0]);
1680 resolve_bool_comparison(ir->operands[1], &op[1]);
1681 }
1682
1683 /* "!=" operator producing a scalar boolean. */
1684 if (ir->operands[0]->type->is_vector() ||
1685 ir->operands[1]->type->is_vector()) {
1686 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1687
1688 emit(MOV(result_dst, src_reg(0)));
1689 inst = emit(MOV(result_dst, src_reg(~0)));
1690 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1691 } else {
1692 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1693 }
1694 break;
1695
1696 case ir_unop_any:
1697 if (devinfo->gen <= 5) {
1698 resolve_bool_comparison(ir->operands[0], &op[0]);
1699 }
1700 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1701 emit(MOV(result_dst, src_reg(0)));
1702
1703 inst = emit(MOV(result_dst, src_reg(~0)));
1704 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1705 break;
1706
1707 case ir_binop_logic_xor:
1708 emit(XOR(result_dst, op[0], op[1]));
1709 break;
1710
1711 case ir_binop_logic_or:
1712 emit(OR(result_dst, op[0], op[1]));
1713 break;
1714
1715 case ir_binop_logic_and:
1716 emit(AND(result_dst, op[0], op[1]));
1717 break;
1718
1719 case ir_binop_dot:
1720 assert(ir->operands[0]->type->is_vector());
1721 assert(ir->operands[0]->type == ir->operands[1]->type);
1722 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1723 break;
1724
1725 case ir_unop_sqrt:
1726 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1727 break;
1728 case ir_unop_rsq:
1729 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1730 break;
1731
1732 case ir_unop_bitcast_i2f:
1733 case ir_unop_bitcast_u2f:
1734 this->result = op[0];
1735 this->result.type = BRW_REGISTER_TYPE_F;
1736 break;
1737
1738 case ir_unop_bitcast_f2i:
1739 this->result = op[0];
1740 this->result.type = BRW_REGISTER_TYPE_D;
1741 break;
1742
1743 case ir_unop_bitcast_f2u:
1744 this->result = op[0];
1745 this->result.type = BRW_REGISTER_TYPE_UD;
1746 break;
1747
1748 case ir_unop_i2f:
1749 case ir_unop_i2u:
1750 case ir_unop_u2i:
1751 case ir_unop_u2f:
1752 case ir_unop_f2i:
1753 case ir_unop_f2u:
1754 emit(MOV(result_dst, op[0]));
1755 break;
1756 case ir_unop_b2i:
1757 case ir_unop_b2f:
1758 if (devinfo->gen <= 5) {
1759 resolve_bool_comparison(ir->operands[0], &op[0]);
1760 }
1761 emit(MOV(result_dst, negate(op[0])));
1762 break;
1763 case ir_unop_f2b:
1764 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1765 break;
1766 case ir_unop_i2b:
1767 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1768 break;
1769
1770 case ir_unop_trunc:
1771 emit(RNDZ(result_dst, op[0]));
1772 break;
1773 case ir_unop_ceil: {
1774 src_reg tmp = src_reg(this, ir->type);
1775 op[0].negate = !op[0].negate;
1776 emit(RNDD(dst_reg(tmp), op[0]));
1777 tmp.negate = true;
1778 emit(MOV(result_dst, tmp));
1779 }
1780 break;
1781 case ir_unop_floor:
1782 inst = emit(RNDD(result_dst, op[0]));
1783 break;
1784 case ir_unop_fract:
1785 inst = emit(FRC(result_dst, op[0]));
1786 break;
1787 case ir_unop_round_even:
1788 emit(RNDE(result_dst, op[0]));
1789 break;
1790
1791 case ir_binop_min:
1792 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1793 break;
1794 case ir_binop_max:
1795 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1796 break;
1797
1798 case ir_binop_pow:
1799 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1800 break;
1801
1802 case ir_unop_bit_not:
1803 inst = emit(NOT(result_dst, op[0]));
1804 break;
1805 case ir_binop_bit_and:
1806 inst = emit(AND(result_dst, op[0], op[1]));
1807 break;
1808 case ir_binop_bit_xor:
1809 inst = emit(XOR(result_dst, op[0], op[1]));
1810 break;
1811 case ir_binop_bit_or:
1812 inst = emit(OR(result_dst, op[0], op[1]));
1813 break;
1814
1815 case ir_binop_lshift:
1816 inst = emit(SHL(result_dst, op[0], op[1]));
1817 break;
1818
1819 case ir_binop_rshift:
1820 if (ir->type->base_type == GLSL_TYPE_INT)
1821 inst = emit(ASR(result_dst, op[0], op[1]));
1822 else
1823 inst = emit(SHR(result_dst, op[0], op[1]));
1824 break;
1825
1826 case ir_binop_bfm:
1827 emit(BFI1(result_dst, op[0], op[1]));
1828 break;
1829
1830 case ir_binop_ubo_load: {
1831 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1832 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1833 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1834 src_reg offset;
1835
1836 /* Now, load the vector from that offset. */
1837 assert(ir->type->is_vector() || ir->type->is_scalar());
1838
1839 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1840 packed_consts.type = result.type;
1841 src_reg surf_index;
1842
1843 if (const_uniform_block) {
1844 /* The block index is a constant, so just emit the binding table entry
1845 * as an immediate.
1846 */
1847 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1848 const_uniform_block->value.u[0]);
1849 } else {
1850 /* The block index is not a constant. Evaluate the index expression
1851 * per-channel and add the base UBO index; we have to select a value
1852 * from any live channel.
1853 */
1854 surf_index = src_reg(this, glsl_type::uint_type);
1855 emit(ADD(dst_reg(surf_index), op[0],
1856 src_reg(prog_data->base.binding_table.ubo_start)));
1857 surf_index = emit_uniformize(surf_index);
1858
1859 /* Assume this may touch any UBO. It would be nice to provide
1860 * a tighter bound, but the array information is already lowered away.
1861 */
1862 brw_mark_surface_used(&prog_data->base,
1863 prog_data->base.binding_table.ubo_start +
1864 shader_prog->NumUniformBlocks - 1);
1865 }
1866
1867 if (const_offset_ir) {
1868 if (devinfo->gen >= 8) {
1869 /* Store the offset in a GRF so we can send-from-GRF. */
1870 offset = src_reg(this, glsl_type::int_type);
1871 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1872 } else {
1873 /* Immediates are fine on older generations since they'll be moved
1874 * to a (potentially fake) MRF at the generator level.
1875 */
1876 offset = src_reg(const_offset / 16);
1877 }
1878 } else {
1879 offset = src_reg(this, glsl_type::uint_type);
1880 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1881 }
1882
1883 emit_pull_constant_load_reg(dst_reg(packed_consts),
1884 surf_index,
1885 offset,
1886 NULL, NULL /* before_block/inst */);
1887
1888 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1889 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1890 const_offset % 16 / 4,
1891 const_offset % 16 / 4,
1892 const_offset % 16 / 4);
1893
1894 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1895 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1896 emit(CMP(result_dst, packed_consts, src_reg(0u),
1897 BRW_CONDITIONAL_NZ));
1898 } else {
1899 emit(MOV(result_dst, packed_consts));
1900 }
1901 break;
1902 }
1903
1904 case ir_binop_vector_extract:
1905 unreachable("should have been lowered by vec_index_to_cond_assign");
1906
1907 case ir_triop_fma:
1908 op[0] = fix_3src_operand(op[0]);
1909 op[1] = fix_3src_operand(op[1]);
1910 op[2] = fix_3src_operand(op[2]);
1911 /* Note that the instruction's argument order is reversed from GLSL
1912 * and the IR.
1913 */
1914 emit(MAD(result_dst, op[2], op[1], op[0]));
1915 break;
1916
1917 case ir_triop_lrp:
1918 emit_lrp(result_dst, op[0], op[1], op[2]);
1919 break;
1920
1921 case ir_triop_csel:
1922 unreachable("already handled above");
1923 break;
1924
1925 case ir_triop_bfi:
1926 op[0] = fix_3src_operand(op[0]);
1927 op[1] = fix_3src_operand(op[1]);
1928 op[2] = fix_3src_operand(op[2]);
1929 emit(BFI2(result_dst, op[0], op[1], op[2]));
1930 break;
1931
1932 case ir_triop_bitfield_extract:
1933 op[0] = fix_3src_operand(op[0]);
1934 op[1] = fix_3src_operand(op[1]);
1935 op[2] = fix_3src_operand(op[2]);
1936 /* Note that the instruction's argument order is reversed from GLSL
1937 * and the IR.
1938 */
1939 emit(BFE(result_dst, op[2], op[1], op[0]));
1940 break;
1941
1942 case ir_triop_vector_insert:
1943 unreachable("should have been lowered by lower_vector_insert");
1944
1945 case ir_quadop_bitfield_insert:
1946 unreachable("not reached: should be handled by "
1947 "bitfield_insert_to_bfm_bfi\n");
1948
1949 case ir_quadop_vector:
1950 unreachable("not reached: should be handled by lower_quadop_vector");
1951
1952 case ir_unop_pack_half_2x16:
1953 emit_pack_half_2x16(result_dst, op[0]);
1954 break;
1955 case ir_unop_unpack_half_2x16:
1956 emit_unpack_half_2x16(result_dst, op[0]);
1957 break;
1958 case ir_unop_unpack_unorm_4x8:
1959 emit_unpack_unorm_4x8(result_dst, op[0]);
1960 break;
1961 case ir_unop_unpack_snorm_4x8:
1962 emit_unpack_snorm_4x8(result_dst, op[0]);
1963 break;
1964 case ir_unop_pack_unorm_4x8:
1965 emit_pack_unorm_4x8(result_dst, op[0]);
1966 break;
1967 case ir_unop_pack_snorm_4x8:
1968 emit_pack_snorm_4x8(result_dst, op[0]);
1969 break;
1970 case ir_unop_pack_snorm_2x16:
1971 case ir_unop_pack_unorm_2x16:
1972 case ir_unop_unpack_snorm_2x16:
1973 case ir_unop_unpack_unorm_2x16:
1974 unreachable("not reached: should be handled by lower_packing_builtins");
1975 case ir_unop_unpack_half_2x16_split_x:
1976 case ir_unop_unpack_half_2x16_split_y:
1977 case ir_binop_pack_half_2x16_split:
1978 case ir_unop_interpolate_at_centroid:
1979 case ir_binop_interpolate_at_sample:
1980 case ir_binop_interpolate_at_offset:
1981 unreachable("not reached: should not occur in vertex shader");
1982 case ir_binop_ldexp:
1983 unreachable("not reached: should be handled by ldexp_to_arith()");
1984 case ir_unop_d2f:
1985 case ir_unop_f2d:
1986 case ir_unop_d2i:
1987 case ir_unop_i2d:
1988 case ir_unop_d2u:
1989 case ir_unop_u2d:
1990 case ir_unop_d2b:
1991 case ir_unop_pack_double_2x32:
1992 case ir_unop_unpack_double_2x32:
1993 case ir_unop_frexp_sig:
1994 case ir_unop_frexp_exp:
1995 unreachable("fp64 todo");
1996 }
1997 }
1998
1999
2000 void
2001 vec4_visitor::visit(ir_swizzle *ir)
2002 {
2003 /* Note that this is only swizzles in expressions, not those on the left
2004 * hand side of an assignment, which do write masking. See ir_assignment
2005 * for that.
2006 */
2007 const unsigned swz = brw_compose_swizzle(
2008 brw_swizzle_for_size(ir->type->vector_elements),
2009 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2010
2011 ir->val->accept(this);
2012 this->result = swizzle(this->result, swz);
2013 }
2014
2015 void
2016 vec4_visitor::visit(ir_dereference_variable *ir)
2017 {
2018 const struct glsl_type *type = ir->type;
2019 dst_reg *reg = variable_storage(ir->var);
2020
2021 if (!reg) {
2022 fail("Failed to find variable storage for %s\n", ir->var->name);
2023 this->result = src_reg(brw_null_reg());
2024 return;
2025 }
2026
2027 this->result = src_reg(*reg);
2028
2029 /* System values get their swizzle from the dst_reg writemask */
2030 if (ir->var->data.mode == ir_var_system_value)
2031 return;
2032
2033 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2034 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2035 }
2036
2037
2038 int
2039 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2040 {
2041 /* Under normal circumstances array elements are stored consecutively, so
2042 * the stride is equal to the size of the array element.
2043 */
2044 return type_size_vec4(ir->type);
2045 }
2046
2047
2048 void
2049 vec4_visitor::visit(ir_dereference_array *ir)
2050 {
2051 ir_constant *constant_index;
2052 src_reg src;
2053 int array_stride = compute_array_stride(ir);
2054
2055 constant_index = ir->array_index->constant_expression_value();
2056
2057 ir->array->accept(this);
2058 src = this->result;
2059
2060 if (constant_index) {
2061 src.reg_offset += constant_index->value.i[0] * array_stride;
2062 } else {
2063 /* Variable index array dereference. The result is the "vec4" at the
2064 * base of the array plus a computed index that offsets the register
2065 * index.
2066 */
2067 ir->array_index->accept(this);
2068
2069 src_reg index_reg;
2070
2071 if (array_stride == 1) {
2072 index_reg = this->result;
2073 } else {
2074 index_reg = src_reg(this, glsl_type::int_type);
2075
2076 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2077 }
2078
2079 if (src.reladdr) {
2080 src_reg temp = src_reg(this, glsl_type::int_type);
2081
2082 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2083
2084 index_reg = temp;
2085 }
2086
2087 src.reladdr = ralloc(mem_ctx, src_reg);
2088 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2089 }
2090
2091 /* If the type is smaller than a vec4, replicate the last channel out. */
2092 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2093 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2094 else
2095 src.swizzle = BRW_SWIZZLE_NOOP;
2096 src.type = brw_type_for_base_type(ir->type);
2097
2098 this->result = src;
2099 }
2100
2101 void
2102 vec4_visitor::visit(ir_dereference_record *ir)
2103 {
2104 unsigned int i;
2105 const glsl_type *struct_type = ir->record->type;
2106 int offset = 0;
2107
2108 ir->record->accept(this);
2109
2110 for (i = 0; i < struct_type->length; i++) {
2111 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2112 break;
2113 offset += type_size_vec4(struct_type->fields.structure[i].type);
2114 }
2115
2116 /* If the type is smaller than a vec4, replicate the last channel out. */
2117 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2118 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2119 else
2120 this->result.swizzle = BRW_SWIZZLE_NOOP;
2121 this->result.type = brw_type_for_base_type(ir->type);
2122
2123 this->result.reg_offset += offset;
2124 }
2125
2126 /**
2127 * We want to be careful in assignment setup to hit the actual storage
2128 * instead of potentially using a temporary like we might with the
2129 * ir_dereference handler.
2130 */
2131 static dst_reg
2132 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2133 {
2134 /* The LHS must be a dereference. If the LHS is a variable-indexed array
2135 * access of a vector, it must be separated into a series of conditional moves
2136 * before reaching this point (see ir_vec_index_to_cond_assign).
2137 */
2138 assert(ir->as_dereference());
2139 ir_dereference_array *deref_array = ir->as_dereference_array();
2140 if (deref_array) {
2141 assert(!deref_array->array->type->is_vector());
2142 }
2143
2144 /* Use the rvalue deref handler for the most part. We'll ignore its
2145 * swizzles, though, and express write swizzles through the writemask.
2146 */
2147 ir->accept(v);
2148 return dst_reg(v->result);
2149 }
2150
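/**
 * Emit a (possibly predicated) block copy of an aggregate value, recursing
 * through structs, arrays and matrices down to scalar/vector MOVs and
 * advancing the destination and source register offsets as it goes.
 */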
2151 void
2152 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2153 const struct glsl_type *type,
2154 enum brw_predicate predicate)
2155 {
2156 if (type->base_type == GLSL_TYPE_STRUCT) {
2157 for (unsigned int i = 0; i < type->length; i++) {
2158 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2159 }
2160 return;
2161 }
2162
2163 if (type->is_array()) {
2164 for (unsigned int i = 0; i < type->length; i++) {
2165 emit_block_move(dst, src, type->fields.array, predicate);
2166 }
2167 return;
2168 }
2169
2170 if (type->is_matrix()) {
2171 const struct glsl_type *vec_type;
2172
2173 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2174 type->vector_elements, 1);
2175
2176 for (int i = 0; i < type->matrix_columns; i++) {
2177 emit_block_move(dst, src, vec_type, predicate);
2178 }
2179 return;
2180 }
2181
2182 assert(type->is_scalar() || type->is_vector());
2183
2184 dst->type = brw_type_for_base_type(type);
2185 src->type = dst->type;
2186
2187 dst->writemask = (1 << type->vector_elements) - 1;
2188
2189 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2190
2191 vec4_instruction *inst = emit(MOV(*dst, *src));
2192 inst->predicate = predicate;
2193
2194 dst->reg_offset++;
2195 src->reg_offset++;
2196 }
2197
2198
2199 /* If the RHS processing resulted in an instruction generating a
2200 * temporary value, and it would be easy to rewrite the instruction to
2201 * generate its result right into the LHS instead, do so. This ends
2202 * up reliably removing instructions where it can be tricky to do so
2203 * later without real UD chain information.
2204 */
2205 bool
2206 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2207 dst_reg dst,
2208 src_reg src,
2209 vec4_instruction *pre_rhs_inst,
2210 vec4_instruction *last_rhs_inst)
2211 {
2212 /* This could be supported, but it would take more smarts. */
2213 if (ir->condition)
2214 return false;
2215
2216 if (pre_rhs_inst == last_rhs_inst)
2217 return false; /* No instructions generated to work with. */
2218
2219 /* Make sure the last instruction generated our source reg. */
2220 if (src.file != GRF ||
2221 src.file != last_rhs_inst->dst.file ||
2222 src.reg != last_rhs_inst->dst.reg ||
2223 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2224 src.reladdr ||
2225 src.abs ||
2226 src.negate ||
2227 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2228 return false;
2229
2230 /* Check that the last instruction fully initialized the channels
2231 * we want to use, in the order we want to use them. We could
2232 * potentially reswizzle the operands of many instructions so that
2233 * we could handle out-of-order channels, but we don't yet.
2234 */
2235
2236 for (unsigned i = 0; i < 4; i++) {
2237 if (dst.writemask & (1 << i)) {
2238 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2239 return false;
2240
2241 if (BRW_GET_SWZ(src.swizzle, i) != i)
2242 return false;
2243 }
2244 }
2245
2246 /* Success! Rewrite the instruction. */
2247 last_rhs_inst->dst.file = dst.file;
2248 last_rhs_inst->dst.reg = dst.reg;
2249 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2250 last_rhs_inst->dst.reladdr = dst.reladdr;
2251 last_rhs_inst->dst.writemask &= dst.writemask;
2252
2253 return true;
2254 }
2255
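/**
 * Visit an assignment. Aggregates (structs, arrays, matrices) go through
 * emit_block_move(); scalar/vector assignments swizzle the RHS into the
 * written channels, try to fold the result directly into the instruction
 * that produced it, and otherwise emit (possibly predicated) MOVs.
 */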
2256 void
2257 vec4_visitor::visit(ir_assignment *ir)
2258 {
2259 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2260 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2261
2262 if (!ir->lhs->type->is_scalar() &&
2263 !ir->lhs->type->is_vector()) {
2264 ir->rhs->accept(this);
2265 src_reg src = this->result;
2266
2267 if (ir->condition) {
2268 emit_bool_to_cond_code(ir->condition, &predicate);
2269 }
2270
2271 /* emit_block_move doesn't account for swizzles in the source register.
2272 * This should be ok, since the source register is a structure or an
2273 * array, and those can't be swizzled. But double-check to be sure.
2274 */
2275 assert(src.swizzle ==
2276 (ir->rhs->type->is_matrix()
2277 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2278 : BRW_SWIZZLE_NOOP));
2279
2280 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2281 return;
2282 }
2283
2284 /* Now we're down to just a scalar/vector with writemasks. */
2285 int i;
2286
2287 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2288 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2289
2290 ir->rhs->accept(this);
2291
2292 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2293
2294 int swizzles[4];
2295 int src_chan = 0;
2296
2297 assert(ir->lhs->type->is_vector() ||
2298 ir->lhs->type->is_scalar());
2299 dst.writemask = ir->write_mask;
2300
2301 /* Swizzle a small RHS vector into the channels being written.
2302 *
2303 * GLSL IR treats write_mask as dictating how many channels are
2304 * present on the RHS, while in our instructions we need to make
2305 * those channels appear in the slots of the vec4 they're written to.
2306 */
2307 for (int i = 0; i < 4; i++)
2308 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2309
2310 src_reg src = swizzle(this->result,
2311 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2312 swizzles[2], swizzles[3]));
2313
2314 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2315 return;
2316 }
2317
2318 if (ir->condition) {
2319 emit_bool_to_cond_code(ir->condition, &predicate);
2320 }
2321
2322 for (i = 0; i < type_size_vec4(ir->lhs->type); i++) {
2323 vec4_instruction *inst = emit(MOV(dst, src));
2324 inst->predicate = predicate;
2325
2326 dst.reg_offset++;
2327 src.reg_offset++;
2328 }
2329 }
2330
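/**
 * Emit MOVs of immediate values for a constant, recursing through structs,
 * arrays and matrices, and coalescing identical components of a vector into
 * a single writemasked MOV.
 */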
2331 void
2332 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2333 {
2334 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2335 foreach_in_list(ir_constant, field_value, &ir->components) {
2336 emit_constant_values(dst, field_value);
2337 }
2338 return;
2339 }
2340
2341 if (ir->type->is_array()) {
2342 for (unsigned int i = 0; i < ir->type->length; i++) {
2343 emit_constant_values(dst, ir->array_elements[i]);
2344 }
2345 return;
2346 }
2347
2348 if (ir->type->is_matrix()) {
2349 for (int i = 0; i < ir->type->matrix_columns; i++) {
2350 float *vec = &ir->value.f[i * ir->type->vector_elements];
2351
2352 for (int j = 0; j < ir->type->vector_elements; j++) {
2353 dst->writemask = 1 << j;
2354 dst->type = BRW_REGISTER_TYPE_F;
2355
2356 emit(MOV(*dst, src_reg(vec[j])));
2357 }
2358 dst->reg_offset++;
2359 }
2360 return;
2361 }
2362
2363 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2364
2365 for (int i = 0; i < ir->type->vector_elements; i++) {
2366 if (!(remaining_writemask & (1 << i)))
2367 continue;
2368
2369 dst->writemask = 1 << i;
2370 dst->type = brw_type_for_base_type(ir->type);
2371
2372 /* Find other components that match the one we're about to
2373 * write. Emits fewer instructions for things like vec4(0.5,
2374 * 1.5, 1.5, 1.5).
2375 */
2376 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2377 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2378 if (ir->value.b[i] == ir->value.b[j])
2379 dst->writemask |= (1 << j);
2380 } else {
2381 /* u, i, and f storage all line up, so no need for a
2382 * switch case for comparing each type.
2383 */
2384 if (ir->value.u[i] == ir->value.u[j])
2385 dst->writemask |= (1 << j);
2386 }
2387 }
2388
2389 switch (ir->type->base_type) {
2390 case GLSL_TYPE_FLOAT:
2391 emit(MOV(*dst, src_reg(ir->value.f[i])));
2392 break;
2393 case GLSL_TYPE_INT:
2394 emit(MOV(*dst, src_reg(ir->value.i[i])));
2395 break;
2396 case GLSL_TYPE_UINT:
2397 emit(MOV(*dst, src_reg(ir->value.u[i])));
2398 break;
2399 case GLSL_TYPE_BOOL:
2400 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2401 break;
2402 default:
2403 unreachable("Non-float/uint/int/bool constant");
2404 }
2405
2406 remaining_writemask &= ~dst->writemask;
2407 }
2408 dst->reg_offset++;
2409 }
2410
2411 void
2412 vec4_visitor::visit(ir_constant *ir)
2413 {
2414 dst_reg dst = dst_reg(this, ir->type);
2415 this->result = src_reg(dst);
2416
2417 emit_constant_values(&dst, ir);
2418 }
2419
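/**
 * Handle the atomic counter intrinsics (read, increment, predecrement) by
 * computing the surface offset of the counter and emitting the matching
 * untyped surface read or untyped atomic message.
 */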
2420 void
2421 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2422 {
2423 ir_dereference *deref = static_cast<ir_dereference *>(
2424 ir->actual_parameters.get_head());
2425 ir_variable *location = deref->variable_referenced();
2426 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2427 location->data.binding);
2428
2429 /* Calculate the surface offset */
2430 src_reg offset(this, glsl_type::uint_type);
2431 ir_dereference_array *deref_array = deref->as_dereference_array();
2432 if (deref_array) {
2433 deref_array->array_index->accept(this);
2434
2435 src_reg tmp(this, glsl_type::uint_type);
2436 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2437 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2438 } else {
2439 offset = location->data.atomic.offset;
2440 }
2441
2442 /* Emit the appropriate machine instruction */
2443 const char *callee = ir->callee->function_name();
2444 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2445
2446 if (!strcmp("__intrinsic_atomic_read", callee)) {
2447 emit_untyped_surface_read(surf_index, dst, offset);
2448
2449 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2450 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2451 src_reg(), src_reg());
2452
2453 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2454 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2455 src_reg(), src_reg());
2456 }
2457
2458 brw_mark_surface_used(stage_prog_data, surf_index);
2459 }
2460
2461 void
2462 vec4_visitor::visit(ir_call *ir)
2463 {
2464 const char *callee = ir->callee->function_name();
2465
2466 if (!strcmp("__intrinsic_atomic_read", callee) ||
2467 !strcmp("__intrinsic_atomic_increment", callee) ||
2468 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2469 visit_atomic_counter_intrinsic(ir);
2470 } else {
2471 unreachable("Unsupported intrinsic.");
2472 }
2473 }
2474
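/**
 * Fetch the MCS (multisample control surface) data for the given coordinate,
 * for use by a following TXF_CMS sample from a compressed multisample
 * surface.
 */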
2475 src_reg
2476 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2477 src_reg coordinate, src_reg sampler)
2478 {
2479 vec4_instruction *inst =
2480 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2481 dst_reg(this, glsl_type::uvec4_type));
2482 inst->base_mrf = 2;
2483 inst->src[1] = sampler;
2484
2485 int param_base;
2486
2487 if (devinfo->gen >= 9) {
2488 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2489 vec4_instruction *header_inst = new(mem_ctx)
2490 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2491 dst_reg(MRF, inst->base_mrf));
2492
2493 emit(header_inst);
2494
2495 inst->mlen = 2;
2496 inst->header_size = 1;
2497 param_base = inst->base_mrf + 1;
2498 } else {
2499 inst->mlen = 1;
2500 param_base = inst->base_mrf;
2501 }
2502
2503 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2504 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2505 int zero_mask = 0xf & ~coord_mask;
2506
2507 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2508 coordinate));
2509
2510 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2511 src_reg(0)));
2512
2513 emit(inst);
2514 return src_reg(inst->dst);
2515 }
2516
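/**
 * Returns true if the sampler index can't be expressed directly in the
 * message descriptor: on Haswell and later, indices that aren't compile-time
 * constants or that are 16 or greater must go through the message header.
 */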
2517 bool
2518 vec4_visitor::is_high_sampler(src_reg sampler)
2519 {
2520 if (devinfo->gen < 8 && !devinfo->is_haswell)
2521 return false;
2522
2523 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2524 }
2525
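/**
 * Emit the sampler message for a texture operation: pick the hardware
 * opcode, load the payload (coordinate, shadow comparitor, LOD or gradients,
 * sample index/MCS, offsets) into MRFs, and apply any needed fixups such as
 * the cube array TXS correction and the Gen6 gather workarounds before
 * swizzling the result.
 */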
2526 void
2527 vec4_visitor::emit_texture(ir_texture_opcode op,
2528 dst_reg dest,
2529 const glsl_type *dest_type,
2530 src_reg coordinate,
2531 int coord_components,
2532 src_reg shadow_comparitor,
2533 src_reg lod, src_reg lod2,
2534 src_reg sample_index,
2535 uint32_t constant_offset,
2536 src_reg offset_value,
2537 src_reg mcs,
2538 bool is_cube_array,
2539 uint32_t sampler,
2540 src_reg sampler_reg)
2541 {
2542 enum opcode opcode;
2543 switch (op) {
2544 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2545 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2546 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2547 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2548 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2549 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2550 case ir_tg4: opcode = offset_value.file != BAD_FILE
2551 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2552 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2553 case ir_txb:
2554 unreachable("TXB is not valid for vertex shaders.");
2555 case ir_lod:
2556 unreachable("LOD is not valid for vertex shaders.");
2557 default:
2558 unreachable("Unrecognized tex op");
2559 }
2560
2561 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2562 opcode, dst_reg(this, dest_type));
2563
2564 inst->offset = constant_offset;
2565
2566 /* The message header is necessary for:
2567 * - Gen4 (always)
2568 * - Gen9+ for selecting SIMD4x2
2569 * - Texel offsets
2570 * - Gather channel selection
2571 * - Sampler indices too large to fit in a 4-bit value.
2572 */
2573 inst->header_size =
2574 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2575 inst->offset != 0 || op == ir_tg4 ||
2576 is_high_sampler(sampler_reg)) ? 1 : 0;
2577 inst->base_mrf = 2;
2578 inst->mlen = inst->header_size + 1; /* always at least one */
2579 inst->dst.writemask = WRITEMASK_XYZW;
2580 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
2581
2582 inst->src[1] = sampler_reg;
2583
2584 /* MRF for the first parameter */
2585 int param_base = inst->base_mrf + inst->header_size;
2586
2587 if (op == ir_txs || op == ir_query_levels) {
2588 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2589 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2590 } else {
2591 /* Load the coordinate */
2592 /* FINISHME: gl_clamp_mask and saturate */
2593 int coord_mask = (1 << coord_components) - 1;
2594 int zero_mask = 0xf & ~coord_mask;
2595
2596 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2597 coordinate));
2598
2599 if (zero_mask != 0) {
2600 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2601 src_reg(0)));
2602 }
2603 /* Load the shadow comparitor */
2604 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
2605 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
2606 WRITEMASK_X),
2607 shadow_comparitor));
2608 inst->mlen++;
2609 }
2610
2611 /* Load the LOD info */
2612 if (op == ir_tex || op == ir_txl) {
2613 int mrf, writemask;
2614 if (devinfo->gen >= 5) {
2615 mrf = param_base + 1;
2616 if (shadow_comparitor.file != BAD_FILE) {
2617 writemask = WRITEMASK_Y;
2618 /* mlen already incremented */
2619 } else {
2620 writemask = WRITEMASK_X;
2621 inst->mlen++;
2622 }
2623 } else /* devinfo->gen == 4 */ {
2624 mrf = param_base;
2625 writemask = WRITEMASK_W;
2626 }
2627 lod.swizzle = BRW_SWIZZLE_XXXX;
2628 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2629 } else if (op == ir_txf) {
2630 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2631 } else if (op == ir_txf_ms) {
2632 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2633 sample_index));
2634 if (devinfo->gen >= 7) {
2635 /* MCS data is in the first channel of `mcs`, but we need to get it into
2636 * the .y channel of the second vec4 of params, so replicate .x across
2637 * the whole vec4 and then mask off everything except .y
2638 */
2639 mcs.swizzle = BRW_SWIZZLE_XXXX;
2640 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2641 mcs));
2642 }
2643 inst->mlen++;
2644 } else if (op == ir_txd) {
2645 const brw_reg_type type = lod.type;
2646
2647 if (devinfo->gen >= 5) {
2648 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2649 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2650 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2651 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2652 inst->mlen++;
2653
2654 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
2655 lod.swizzle = BRW_SWIZZLE_ZZZZ;
2656 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
2657 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2658 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2659 inst->mlen++;
2660
2661 if (shadow_comparitor.file != BAD_FILE) {
2662 emit(MOV(dst_reg(MRF, param_base + 2,
2663 shadow_comparitor.type, WRITEMASK_Z),
2664 shadow_comparitor));
2665 }
2666 }
2667 } else /* devinfo->gen == 4 */ {
2668 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2669 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2670 inst->mlen += 2;
2671 }
2672 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
2673 if (shadow_comparitor.file != BAD_FILE) {
2674 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
2675 shadow_comparitor));
2676 }
2677
2678 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2679 offset_value));
2680 inst->mlen++;
2681 }
2682 }
2683
2684 emit(inst);
2685
2686 /* Fix up the number of layers (z) for cube arrays: the hardware returns
2687 * faces * layers, but the spec requires just layers.
2688 */
2689 if (op == ir_txs && is_cube_array) {
2690 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2691 writemask(inst->dst, WRITEMASK_Z),
2692 src_reg(inst->dst), src_reg(6));
2693 }
2694
2695 if (devinfo->gen == 6 && op == ir_tg4) {
2696 emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
2697 }
2698
2699 swizzle_result(op, dest,
2700 src_reg(inst->dst), sampler, dest_type);
2701 }
2702
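/**
 * Visit a texture IR node: resolve the sampler index (including
 * non-constant sampler array indexing), evaluate the operand
 * subexpressions, and hand everything off to emit_texture().
 */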
2703 void
2704 vec4_visitor::visit(ir_texture *ir)
2705 {
2706 uint32_t sampler =
2707 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2708
2709 ir_rvalue *nonconst_sampler_index =
2710 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2711
2712 /* Handle non-constant sampler array indexing */
2713 src_reg sampler_reg;
2714 if (nonconst_sampler_index) {
2715 /* The highest sampler which may be used by this operation is
2716 * the last element of the array. Mark it here, because the generator
2717 * doesn't have enough information to determine the bound.
2718 */
2719 uint32_t array_size = ir->sampler->as_dereference_array()
2720 ->array->type->array_size();
2721
2722 uint32_t max_used = sampler + array_size - 1;
2723 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2724 max_used += prog_data->base.binding_table.gather_texture_start;
2725 } else {
2726 max_used += prog_data->base.binding_table.texture_start;
2727 }
2728
2729 brw_mark_surface_used(&prog_data->base, max_used);
2730
2731 /* Emit code to evaluate the actual indexing expression */
2732 nonconst_sampler_index->accept(this);
2733 src_reg temp(this, glsl_type::uint_type);
2734 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2735 sampler_reg = emit_uniformize(temp);
2736 } else {
2737 /* Single sampler, or constant array index; the indexing expression
2738 * is just an immediate.
2739 */
2740 sampler_reg = src_reg(sampler);
2741 }
2742
2743 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2744 * emitting anything other than setting up the constant result.
2745 */
2746 if (ir->op == ir_tg4) {
2747 ir_constant *chan = ir->lod_info.component->as_constant();
2748 int swiz = GET_SWZ(key_tex->swizzles[sampler], chan->value.i[0]);
2749 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2750 dst_reg result(this, ir->type);
2751 this->result = src_reg(result);
2752 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2753 return;
2754 }
2755 }
2756
2757 /* Should be lowered by do_lower_texture_projection */
2758 assert(!ir->projector);
2759
2760 /* Should be lowered */
2761 assert(!ir->offset || !ir->offset->type->is_array());
2762
2763 /* Generate code to compute all the subexpression trees. This has to be
2764 * done before loading any values into MRFs for the sampler message since
2765 * generating these values may involve SEND messages that need the MRFs.
2766 */
2767 src_reg coordinate;
2768 int coord_components = 0;
2769 if (ir->coordinate) {
2770 coord_components = ir->coordinate->type->vector_elements;
2771 ir->coordinate->accept(this);
2772 coordinate = this->result;
2773 }
2774
2775 src_reg shadow_comparitor;
2776 if (ir->shadow_comparitor) {
2777 ir->shadow_comparitor->accept(this);
2778 shadow_comparitor = this->result;
2779 }
2780
2781 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2782 src_reg offset_value;
2783 if (has_nonconstant_offset) {
2784 ir->offset->accept(this);
2785 offset_value = src_reg(this->result);
2786 }
2787
2788 src_reg lod, lod2, sample_index, mcs;
2789 switch (ir->op) {
2790 case ir_tex:
2791 lod = src_reg(0.0f);
2792 break;
2793 case ir_txf:
2794 case ir_txl:
2795 case ir_txs:
2796 ir->lod_info.lod->accept(this);
2797 lod = this->result;
2798 break;
2799 case ir_query_levels:
2800 lod = src_reg(0);
2801 break;
2802 case ir_txf_ms:
2803 ir->lod_info.sample_index->accept(this);
2804 sample_index = this->result;
2805
2806 if (devinfo->gen >= 7 && key_tex->compressed_multisample_layout_mask & (1 << sampler))
2807 mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2808 else
2809 mcs = src_reg(0u);
2810 break;
2811 case ir_txd:
2812 ir->lod_info.grad.dPdx->accept(this);
2813 lod = this->result;
2814
2815 ir->lod_info.grad.dPdy->accept(this);
2816 lod2 = this->result;
2817 break;
2818 case ir_txb:
2819 case ir_lod:
2820 case ir_tg4:
2821 break;
2822 }
2823
2824 uint32_t constant_offset = 0;
2825 if (ir->offset != NULL && !has_nonconstant_offset) {
2826 constant_offset =
2827 brw_texture_offset(ir->offset->as_constant()->value.i,
2828 ir->offset->type->vector_elements);
2829 }
2830
2831 /* Stuff the channel select bits in the top of the texture offset */
2832 if (ir->op == ir_tg4)
2833 constant_offset |=
2834 gather_channel( ir->lod_info.component->as_constant()->value.i[0],
2835 sampler) << 16;
2836
2837 glsl_type const *type = ir->sampler->type;
2838 bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2839 type->sampler_array;
2840
2841 this->result = src_reg(this, ir->type);
2842 dst_reg dest = dst_reg(this->result);
2843
2844 emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
2845 shadow_comparitor,
2846 lod, lod2, sample_index,
2847 constant_offset, offset_value,
2848 mcs, is_cube_array, sampler, sampler_reg);
2849 }
2850
2851 /**
2852 * Apply workarounds for Gen6 gather with UINT/SINT
2853 */
2854 void
2855 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2856 {
2857 if (!wa)
2858 return;
2859
2860 int width = (wa & WA_8BIT) ? 8 : 16;
2861 dst_reg dst_f = dst;
2862 dst_f.type = BRW_REGISTER_TYPE_F;
2863
2864 /* Convert from UNORM to UINT */
2865 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2866 emit(MOV(dst, src_reg(dst_f)));
2867
2868 if (wa & WA_SIGN) {
2869 /* Reinterpret the UINT value as a signed INT value by
2870 * shifting the sign bit into place, then shifting back
2871 * preserving sign.
2872 */
2873 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2874 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2875 }
2876 }
2877
2878 /**
2879 * Set up the gather channel based on the swizzle, for gather4.
2880 */
2881 uint32_t
2882 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2883 {
2884 int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
2885 switch (swiz) {
2886 case SWIZZLE_X: return 0;
2887 case SWIZZLE_Y:
2888 /* gather4 sampler is broken for green channel on RG32F --
2889 * we must ask for blue instead.
2890 */
2891 if (key_tex->gather_channel_quirk_mask & (1 << sampler))
2892 return 2;
2893 return 1;
2894 case SWIZZLE_Z: return 2;
2895 case SWIZZLE_W: return 3;
2896 default:
2897 unreachable("Not reached"); /* zero, one swizzles handled already */
2898 }
2899 }
2900
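/**
 * Apply the texture swizzle from the sampler key to a sample result,
 * emitting writemasked MOVs for the copied, zero and one components.
 */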
2901 void
2902 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
2903 src_reg orig_val, uint32_t sampler,
2904 const glsl_type *dest_type)
2905 {
2906 int s = key_tex->swizzles[sampler];
2907
2908 dst_reg swizzled_result = dest;
2909
2910 if (op == ir_query_levels) {
2911 /* # levels is in .w */
2912 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2913 emit(MOV(swizzled_result, orig_val));
2914 return;
2915 }
2916
2917 if (op == ir_txs || dest_type == glsl_type::float_type
2918 || s == SWIZZLE_NOOP || op == ir_tg4) {
2919 emit(MOV(swizzled_result, orig_val));
2920 return;
2921 }
2922
2923
2924 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2925 int swizzle[4] = {0};
2926
2927 for (int i = 0; i < 4; i++) {
2928 switch (GET_SWZ(s, i)) {
2929 case SWIZZLE_ZERO:
2930 zero_mask |= (1 << i);
2931 break;
2932 case SWIZZLE_ONE:
2933 one_mask |= (1 << i);
2934 break;
2935 default:
2936 copy_mask |= (1 << i);
2937 swizzle[i] = GET_SWZ(s, i);
2938 break;
2939 }
2940 }
2941
2942 if (copy_mask) {
2943 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2944 swizzled_result.writemask = copy_mask;
2945 emit(MOV(swizzled_result, orig_val));
2946 }
2947
2948 if (zero_mask) {
2949 swizzled_result.writemask = zero_mask;
2950 emit(MOV(swizzled_result, src_reg(0.0f)));
2951 }
2952
2953 if (one_mask) {
2954 swizzled_result.writemask = one_mask;
2955 emit(MOV(swizzled_result, src_reg(1.0f)));
2956 }
2957 }
2958
2959 void
2960 vec4_visitor::visit(ir_return *)
2961 {
2962 unreachable("not reached");
2963 }
2964
2965 void
2966 vec4_visitor::visit(ir_discard *)
2967 {
2968 unreachable("not reached");
2969 }
2970
2971 void
2972 vec4_visitor::visit(ir_if *ir)
2973 {
2974 /* Don't point the annotation at the if statement, because then it plus
2975 * the then and else blocks get printed.
2976 */
2977 this->base_ir = ir->condition;
2978
2979 if (devinfo->gen == 6) {
2980 emit_if_gen6(ir);
2981 } else {
2982 enum brw_predicate predicate;
2983 emit_bool_to_cond_code(ir->condition, &predicate);
2984 emit(IF(predicate));
2985 }
2986
2987 visit_instructions(&ir->then_instructions);
2988
2989 if (!ir->else_instructions.is_empty()) {
2990 this->base_ir = ir->condition;
2991 emit(BRW_OPCODE_ELSE);
2992
2993 visit_instructions(&ir->else_instructions);
2994 }
2995
2996 this->base_ir = ir->condition;
2997 emit(BRW_OPCODE_ENDIF);
2998 }
2999
3000 void
3001 vec4_visitor::gs_emit_vertex(int stream_id)
3002 {
3003 unreachable("not reached");
3004 }
3005
3006 void
3007 vec4_visitor::visit(ir_emit_vertex *)
3008 {
3009 unreachable("not reached");
3010 }
3011
3012 void
3013 vec4_visitor::gs_end_primitive()
3014 {
3015 unreachable("not reached");
3016 }
3017
3018
3019 void
3020 vec4_visitor::visit(ir_end_primitive *)
3021 {
3022 unreachable("not reached");
3023 }
3024
3025 void
3026 vec4_visitor::visit(ir_barrier *)
3027 {
3028 unreachable("not reached");
3029 }
3030
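/**
 * Emit an untyped atomic message. The offset and any operands are loaded
 * into the .x channel of consecutive MRFs before sending.
 */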
3031 void
3032 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3033 dst_reg dst, src_reg offset,
3034 src_reg src0, src_reg src1)
3035 {
3036 unsigned mlen = 0;
3037
3038 /* Set the atomic operation offset. */
3039 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
3040 mlen++;
3041
3042 /* Set the atomic operation arguments. */
3043 if (src0.file != BAD_FILE) {
3044 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
3045 mlen++;
3046 }
3047
3048 if (src1.file != BAD_FILE) {
3049 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3050 mlen++;
3051 }
3052
3053 /* Emit the instruction. Note that this maps to the normal SIMD8
3054 * untyped atomic message on Ivy Bridge, but that's OK because
3055 * unused channels will be masked out.
3056 */
3057 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3058 brw_message_reg(0),
3059 src_reg(surf_index), src_reg(atomic_op));
3060 inst->mlen = mlen;
3061 }
3062
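/**
 * Emit an untyped surface read from the given surface at the given offset.
 */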
3063 void
3064 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3065 src_reg offset)
3066 {
3067 /* Set the surface read offset. */
3068 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3069
3070 /* Emit the instruction. Note that this maps to the normal SIMD8
3071 * untyped surface read message, but that's OK because unused
3072 * channels will be masked out.
3073 */
3074 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3075 brw_message_reg(0),
3076 src_reg(surf_index), src_reg(1));
3077 inst->mlen = 1;
3078 }
3079
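/**
 * Compute the NDC (normalized device coordinates) output, (x/w, y/w, z/w,
 * 1/w), which is needed when emitting the vertex on pre-Gen6 hardware.
 */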
3080 void
3081 vec4_visitor::emit_ndc_computation()
3082 {
3083 /* Get the position */
3084 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3085
3086 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3087 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3088 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3089
3090 current_annotation = "NDC";
3091 dst_reg ndc_w = ndc;
3092 ndc_w.writemask = WRITEMASK_W;
3093 src_reg pos_w = pos;
3094 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3095 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3096
3097 dst_reg ndc_xyz = ndc;
3098 ndc_xyz.writemask = WRITEMASK_XYZ;
3099
3100 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3101 }
3102
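/**
 * Write the PSIZ VUE slot. On pre-Gen6 this packs point size, clip-distance
 * flags and the negative-rhw workaround bits into the header word; on Gen6+
 * it writes point size, layer and viewport index into the slot directly.
 */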
3103 void
3104 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3105 {
3106 if (devinfo->gen < 6 &&
3107 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3108 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
3109 devinfo->has_negative_rhw_bug)) {
3110 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3111 dst_reg header1_w = header1;
3112 header1_w.writemask = WRITEMASK_W;
3113
3114 emit(MOV(header1, 0u));
3115
3116 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3117 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3118
3119 current_annotation = "Point size";
3120 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3121 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3122 }
3123
3124 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
3125 current_annotation = "Clipping flags";
3126 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3127 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3128
3129 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3130 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3131 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3132
3133 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3134 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3135 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3136 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3137 }
3138
3139 /* i965 clipping workaround:
3140 * 1) Test for -ve rhw
3141 * 2) If set,
3142 * set ndc = (0,0,0,0)
3143 * set ucp[6] = 1
3144 *
3145 * Later, clipping will detect ucp[6] and ensure the primitive is
3146 * clipped against all fixed planes.
3147 */
3148 if (devinfo->has_negative_rhw_bug) {
3149 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3150 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3151 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3152 vec4_instruction *inst;
3153 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3154 inst->predicate = BRW_PREDICATE_NORMAL;
3155 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3156 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3157 inst->predicate = BRW_PREDICATE_NORMAL;
3158 }
3159
3160 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3161 } else if (devinfo->gen < 6) {
3162 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3163 } else {
3164 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3165 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3166 dst_reg reg_w = reg;
3167 reg_w.writemask = WRITEMASK_W;
3168 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3169 reg_as_src.type = reg_w.type;
3170 reg_as_src.swizzle = brw_swizzle_for_size(1);
3171 emit(MOV(reg_w, reg_as_src));
3172 }
3173 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3174 dst_reg reg_y = reg;
3175 reg_y.writemask = WRITEMASK_Y;
3176 reg_y.type = BRW_REGISTER_TYPE_D;
3177 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3178 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3179 }
3180 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3181 dst_reg reg_z = reg;
3182 reg_z.writemask = WRITEMASK_Z;
3183 reg_z.type = BRW_REGISTER_TYPE_D;
3184 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3185 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3186 }
3187 }
3188 }
3189
3190 vec4_instruction *
3191 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3192 {
3193 assert(varying < VARYING_SLOT_MAX);
3194 assert(output_reg[varying].type == reg.type);
3195 current_annotation = output_reg_annotation[varying];
3196 /* Copy the register, saturating if necessary */
3197 return emit(MOV(reg, src_reg(output_reg[varying])));
3198 }
3199
3200 void
3201 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3202 {
3203 reg.type = BRW_REGISTER_TYPE_F;
3204 output_reg[varying].type = reg.type;
3205
3206 switch (varying) {
3207 case VARYING_SLOT_PSIZ:
3208 {
3209 /* PSIZ is always in slot 0, and is coupled with other flags. */
3210 current_annotation = "indices, point width, clip flags";
3211 emit_psiz_and_flags(reg);
3212 break;
3213 }
3214 case BRW_VARYING_SLOT_NDC:
3215 current_annotation = "NDC";
3216 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3217 break;
3218 case VARYING_SLOT_POS:
3219 current_annotation = "gl_Position";
3220 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3221 break;
3222 case VARYING_SLOT_EDGE:
3223 /* This is present when doing unfilled polygons. We're supposed to copy
3224 * the edge flag from the user-provided vertex array
3225 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3226 * of that attribute (starts as 1.0f). This is then used in clipping to
3227 * determine which edges should be drawn as wireframe.
3228 */
3229 current_annotation = "edge flag";
3230 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3231 glsl_type::float_type, WRITEMASK_XYZW))));
3232 break;
3233 case BRW_VARYING_SLOT_PAD:
3234 /* No need to write to this slot */
3235 break;
3236 case VARYING_SLOT_COL0:
3237 case VARYING_SLOT_COL1:
3238 case VARYING_SLOT_BFC0:
3239 case VARYING_SLOT_BFC1: {
3240 /* These built-in varyings are only supported in compatibility mode,
3241 * and we only support GS in core profile. So, this must be a vertex
3242 * shader.
3243 */
3244 assert(stage == MESA_SHADER_VERTEX);
3245 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3246 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3247 inst->saturate = true;
3248 break;
3249 }
3250
3251 default:
3252 emit_generic_urb_slot(reg, varying);
3253 break;
3254 }
3255 }
3256
3257 static int
3258 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3259 {
3260 if (devinfo->gen >= 6) {
3261 /* URB data written (does not include the message header reg) must
3262 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3263 * section 5.4.3.2.2: URB_INTERLEAVED.
3264 *
3265 * URB entries are allocated on a multiple of 1024 bits, so an
3266 * extra 128 bits written here to make the end align to 256 is
3267 * no problem.
3268 */
3269 if ((mlen % 2) != 1)
3270 mlen++;
3271 }
3272
3273 return mlen;
3274 }
3275
3276
3277 /**
3278 * Generates the VUE payload plus the necessary URB write instructions to
3279 * output it.
3280 *
3281 * The VUE layout is documented in Volume 2a.
3282 */
3283 void
3284 vec4_visitor::emit_vertex()
3285 {
3286 /* MRF 0 is reserved for the debugger, so start with message header
3287 * in MRF 1.
3288 */
3289 int base_mrf = 1;
3290 int mrf = base_mrf;
3291 /* In the process of generating our URB write message contents, we
3292 * may need to unspill a register or load from an array. Those
3293 * reads would use MRFs 14-15.
3294 */
3295 int max_usable_mrf = 13;
3296
3297 /* The following assertion verifies that max_usable_mrf causes an
3298 * even number of URB write data registers, which will meet gen6's
3299 * requirements for length alignment.
3300 */
3301 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3302
3303 /* First mrf is the g0-based message header containing URB handles and
3304 * such.
3305 */
3306 emit_urb_write_header(mrf++);
3307
3308 if (devinfo->gen < 6) {
3309 emit_ndc_computation();
3310 }
3311
3312 /* We may need to split this up into several URB writes, so do them in a
3313 * loop.
3314 */
3315 int slot = 0;
3316 bool complete = false;
3317 do {
3318 /* URB offset is in URB row increments, and each of our MRFs is half of
3319 * one of those, since we're doing interleaved writes.
3320 */
3321 int offset = slot / 2;
3322
3323 mrf = base_mrf + 1;
3324 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3325 emit_urb_slot(dst_reg(MRF, mrf++),
3326 prog_data->vue_map.slot_to_varying[slot]);
3327
3328 /* If this was max_usable_mrf, we can't fit anything more into this
3329 * URB WRITE.
3330 */
3331 if (mrf > max_usable_mrf) {
3332 slot++;
3333 break;
3334 }
3335 }
3336
3337 complete = slot >= prog_data->vue_map.num_slots;
3338 current_annotation = "URB write";
3339 vec4_instruction *inst = emit_urb_write_opcode(complete);
3340 inst->base_mrf = base_mrf;
3341 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3342 inst->offset += offset;
3343 } while(!complete);
3344 }
3345
3346
3347 src_reg
3348 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3349 src_reg *reladdr, int reg_offset)
3350 {
3351 /* Because we store the values to scratch interleaved like our
3352 * vertex data, we need to scale the vec4 index by 2.
3353 */
3354 int message_header_scale = 2;
3355
3356 /* Pre-gen6, the message header uses byte offsets instead of vec4
3357 * (16-byte) offset units.
3358 */
3359 if (devinfo->gen < 6)
3360 message_header_scale *= 16;
3361
3362 if (reladdr) {
3363 src_reg index = src_reg(this, glsl_type::int_type);
3364
3365 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3366 src_reg(reg_offset)));
3367 emit_before(block, inst, MUL(dst_reg(index), index,
3368 src_reg(message_header_scale)));
3369
3370 return index;
3371 } else {
3372 return src_reg(reg_offset * message_header_scale);
3373 }
3374 }
3375
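/**
 * Compute the offset source for a pull constant load, either as an
 * immediate or, for relative addressing or Gen8+ send-from-GRF, as a
 * register computed before @inst.
 */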
3376 src_reg
3377 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3378 src_reg *reladdr, int reg_offset)
3379 {
3380 if (reladdr) {
3381 src_reg index = src_reg(this, glsl_type::int_type);
3382
3383 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3384 src_reg(reg_offset)));
3385
3386 /* Pre-gen6, the message header uses byte offsets instead of vec4
3387 * (16-byte) offset units.
3388 */
3389 if (devinfo->gen < 6) {
3390 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3391 }
3392
3393 return index;
3394 } else if (devinfo->gen >= 8) {
3395 /* Store the offset in a GRF so we can send-from-GRF. */
3396 src_reg offset = src_reg(this, glsl_type::int_type);
3397 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3398 return offset;
3399 } else {
3400 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3401 return src_reg(reg_offset * message_header_scale);
3402 }
3403 }
3404
3405 /**
3406 * Emits an instruction before @inst to load the value named by @orig_src
3407 * from scratch space at @base_offset to @temp.
3408 *
3409 * @base_offset is measured in 32-byte units (the size of a register).
3410 */
3411 void
3412 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3413 dst_reg temp, src_reg orig_src,
3414 int base_offset)
3415 {
3416 int reg_offset = base_offset + orig_src.reg_offset;
3417 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3418 reg_offset);
3419
3420 emit_before(block, inst, SCRATCH_READ(temp, index));
3421 }
3422
3423 /**
3424 * Emits an instruction after @inst to store the value to be written
3425 * to @orig_dst to scratch space at @base_offset, from @temp.
3426 *
3427 * @base_offset is measured in 32-byte units (the size of a register).
3428 */
3429 void
3430 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3431 int base_offset)
3432 {
3433 int reg_offset = base_offset + inst->dst.reg_offset;
3434 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3435 reg_offset);
3436
3437 /* Create a temporary register to store *inst's result in.
3438 *
3439 * We have to be careful in MOVing from our temporary result register in
3440 * the scratch write. If we swizzle from channels of the temporary that
3441 * weren't initialized, it will confuse live interval analysis, which will
3442 * make spilling fail to make progress.
3443 */
3444 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3445 inst->dst.type),
3446 brw_swizzle_for_mask(inst->dst.writemask));
3447 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3448 inst->dst.writemask));
3449 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3450 if (inst->opcode != BRW_OPCODE_SEL)
3451 write->predicate = inst->predicate;
3452 write->ir = inst->ir;
3453 write->annotation = inst->annotation;
3454 inst->insert_after(block, write);
3455
3456 inst->dst.file = temp.file;
3457 inst->dst.reg = temp.reg;
3458 inst->dst.reg_offset = temp.reg_offset;
3459 inst->dst.reladdr = NULL;
3460 }
3461
3462 /**
3463 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3464 * adds the scratch read(s) before \p inst. The function also checks for
3465 * recursive reladdr scratch accesses, issuing the corresponding scratch
3466 * loads and rewriting reladdr references accordingly.
3467 *
3468 * \return \p src if it did not require a scratch load, otherwise, the
3469 * register holding the result of the scratch load that the caller should
3470 * use to rewrite src.
3471 */
3472 src_reg
3473 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3474 vec4_instruction *inst, src_reg src)
3475 {
3476 /* Resolve recursive reladdr scratch access by calling ourselves
3477 * with src.reladdr
3478 */
3479 if (src.reladdr)
3480 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3481 *src.reladdr);
3482
3483 /* Now handle scratch access on src */
3484 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3485 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3486 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3487 src.reg = temp.reg;
3488 src.reg_offset = temp.reg_offset;
3489 src.reladdr = NULL;
3490 }
3491
3492 return src;
3493 }
3494
3495 /**
3496 * We can't generally support array access in GRF space, because a
3497 * single instruction's destination can only span 2 contiguous
3498 * registers. So, we send all GRF arrays that get variable index
3499 * access to scratch space.
3500 */
3501 void
3502 vec4_visitor::move_grf_array_access_to_scratch()
3503 {
3504 int scratch_loc[this->alloc.count];
3505 memset(scratch_loc, -1, sizeof(scratch_loc));
3506
3507 /* First, calculate the set of virtual GRFs that need to be punted
3508 * to scratch due to having any array access on them, and where in
3509 * scratch.
3510 */
3511 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3512 if (inst->dst.file == GRF && inst->dst.reladdr) {
3513 if (scratch_loc[inst->dst.reg] == -1) {
3514 scratch_loc[inst->dst.reg] = last_scratch;
3515 last_scratch += this->alloc.sizes[inst->dst.reg];
3516 }
3517
3518 for (src_reg *iter = inst->dst.reladdr;
3519 iter->reladdr;
3520 iter = iter->reladdr) {
3521 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3522 scratch_loc[iter->reg] = last_scratch;
3523 last_scratch += this->alloc.sizes[iter->reg];
3524 }
3525 }
3526 }
3527
3528 for (int i = 0 ; i < 3; i++) {
3529 for (src_reg *iter = &inst->src[i];
3530 iter->reladdr;
3531 iter = iter->reladdr) {
3532 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3533 scratch_loc[iter->reg] = last_scratch;
3534 last_scratch += this->alloc.sizes[iter->reg];
3535 }
3536 }
3537 }
3538 }
3539
3540 /* Now, for anything that will be accessed through scratch, rewrite
3541 * it to load/store. Note that this is a _safe list walk, because
3542 * we may generate a new scratch_write instruction after the one
3543 * we're processing.
3544 */
3545 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3546 /* Set up the annotation tracking for new generated instructions. */
3547 base_ir = inst->ir;
3548 current_annotation = inst->annotation;
3549
3550 /* First handle scratch access on the dst. Notice we have to handle
3551 * the case where the dst's reladdr also points to scratch space.
3552 */
3553 if (inst->dst.reladdr)
3554 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3555 *inst->dst.reladdr);
3556
3557 /* Now that we have handled any (possibly recursive) reladdr scratch
3558 * accesses for dst we can safely do the scratch write for dst itself
3559 */
3560 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3561 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3562
3563 /* Now handle scratch access on any src. In this case, since inst->src[i]
3564 * already is a src_reg, we can just call emit_resolve_reladdr with
3565 * inst->src[i] and it will take care of handling scratch loads for
3566 * both src and src.reladdr (recursively).
3567 */
3568 for (int i = 0 ; i < 3; i++) {
3569 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3570 inst->src[i]);
3571 }
3572 }
3573 }
3574
3575 /**
3576 * Emits an instruction before @inst to load the value named by @orig_src
3577 * from the pull constant buffer (surface) at @base_offset to @temp.
3578 */
3579 void
3580 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3581 dst_reg temp, src_reg orig_src,
3582 int base_offset)
3583 {
3584 int reg_offset = base_offset + orig_src.reg_offset;
3585 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3586 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3587 reg_offset);
3588
3589 emit_pull_constant_load_reg(temp,
3590 index,
3591 offset,
3592 block, inst);
3593 }
3594
3595 /**
3596 * Implements array access of uniforms by inserting a
3597 * PULL_CONSTANT_LOAD instruction.
3598 *
3599 * Unlike temporary GRF array access (where we don't support it due to
3600 * the difficulty of doing relative addressing on instruction
3601 * destinations), we could potentially do array access of uniforms
3602 * that were loaded in GRF space as push constants. In real-world
3603 * usage we've seen, though, the arrays being used are always larger
3604 * than we could load as push constants, so just always move all
3605 * uniform array access out to a pull constant buffer.
3606 */
3607 void
3608 vec4_visitor::move_uniform_array_access_to_pull_constants()
3609 {
3610 int pull_constant_loc[this->uniforms];
3611 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3612 bool nested_reladdr;
3613
3614 /* Walk through and find array access of uniforms. Put a copy of that
3615 * uniform in the pull constant buffer.
3616 *
3617 * Note that we don't move constant-indexed accesses to arrays. No
3618 * testing has been done of the performance impact of this choice.
3619 */
3620 do {
3621 nested_reladdr = false;
3622
3623 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3624 for (int i = 0 ; i < 3; i++) {
3625 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3626 continue;
3627
3628 int uniform = inst->src[i].reg;
3629
3630 if (inst->src[i].reladdr->reladdr)
3631 nested_reladdr = true; /* will need another pass */
3632
3633 /* If this array isn't already present in the pull constant buffer,
3634 * add it.
3635 */
3636 if (pull_constant_loc[uniform] == -1) {
3637 const gl_constant_value **values =
3638 &stage_prog_data->param[uniform * 4];
3639
3640 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3641
3642 assert(uniform < uniform_array_size);
3643 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3644 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3645 = values[j];
3646 }
3647 }
3648
3649 /* Set up the annotation tracking for new generated instructions. */
3650 base_ir = inst->ir;
3651 current_annotation = inst->annotation;
3652
3653 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3654
3655 emit_pull_constant_load(block, inst, temp, inst->src[i],
3656 pull_constant_loc[uniform]);
3657
3658 inst->src[i].file = temp.file;
3659 inst->src[i].reg = temp.reg;
3660 inst->src[i].reg_offset = temp.reg_offset;
3661 inst->src[i].reladdr = NULL;
3662 }
3663 }
3664 } while (nested_reladdr);
3665
3666 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3667 * no need to track them as larger-than-vec4 objects. This will be
3668 * relied on in cutting out unused uniform vectors from push
3669 * constants.
3670 */
3671 split_uniform_registers();
3672 }
3673
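/**
 * If a UD source has its negation flag set, resolve the negation into a
 * temporary with an explicit MOV so later instructions see a plain unsigned
 * value.
 */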
3674 void
3675 vec4_visitor::resolve_ud_negate(src_reg *reg)
3676 {
3677 if (reg->type != BRW_REGISTER_TYPE_UD ||
3678 !reg->negate)
3679 return;
3680
3681 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3682 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3683 *reg = temp;
3684 }
3685
3686 /**
3687 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3688 *
3689 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3690 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3691 */
3692 void
3693 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3694 {
3695 assert(devinfo->gen <= 5);
3696
3697 if (!rvalue->type->is_boolean())
3698 return;
3699
3700 src_reg and_result = src_reg(this, rvalue->type);
3701 src_reg neg_result = src_reg(this, rvalue->type);
3702 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3703 emit(MOV(dst_reg(neg_result), negate(and_result)));
3704 *reg = neg_result;
3705 }
3706
3707 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3708 void *log_data,
3709 struct gl_program *prog,
3710 const struct brw_vue_prog_key *key,
3711 struct brw_vue_prog_data *prog_data,
3712 struct gl_shader_program *shader_prog,
3713 gl_shader_stage stage,
3714 void *mem_ctx,
3715 bool no_spills,
3716 int shader_time_index)
3717 : backend_shader(compiler, log_data, mem_ctx,
3718 shader_prog, prog, &prog_data->base, stage),
3719 key(key),
3720 key_tex(&key->tex),
3721 prog_data(prog_data),
3722 sanity_param_count(0),
3723 fail_msg(NULL),
3724 first_non_payload_grf(0),
3725 need_all_constants_in_pull_buffer(false),
3726 no_spills(no_spills),
3727 shader_time_index(shader_time_index),
3728 last_scratch(0)
3729 {
3730 this->failed = false;
3731
3732 this->base_ir = NULL;
3733 this->current_annotation = NULL;
3734 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3735
3736 this->variable_ht = hash_table_ctor(0,
3737 hash_table_pointer_hash,
3738 hash_table_pointer_compare);
3739
3740 this->virtual_grf_start = NULL;
3741 this->virtual_grf_end = NULL;
3742 this->live_intervals = NULL;
3743
3744 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3745
3746 this->uniforms = 0;
3747
3748 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3749 * at least one. See setup_uniforms() in brw_vec4.cpp.
3750 */
3751 this->uniform_array_size = 1;
3752 if (prog_data) {
3753 this->uniform_array_size =
3754 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3755 }
3756
3757 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3758 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3759 }
3760
3761 vec4_visitor::~vec4_visitor()
3762 {
3763 hash_table_dtor(this->variable_ht);
3764 }
3765
3766
3767 void
3768 vec4_visitor::fail(const char *format, ...)
3769 {
3770 va_list va;
3771 char *msg;
3772
3773 if (failed)
3774 return;
3775
3776 failed = true;
3777
3778 va_start(va, format);
3779 msg = ralloc_vasprintf(mem_ctx, format, va);
3780 va_end(va);
3781 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3782
3783 this->fail_msg = msg;
3784
3785 if (debug_enabled) {
3786 fprintf(stderr, "%s", msg);
3787 }
3788 }
3789
3790 } /* namespace brw */