i965/vec4: Print "VS" or "GS" when compiles fail, not "vec4".
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(brw->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
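/* Illustrative example (hypothetical GLSL, not from this file): for
 * `fma(u_scale, pos, bias)` where `u_scale` is a uniform, the uniform operand
 * is first copied into a temporary GRF by the VEC4_OPCODE_UNPACK_UNIFORM above,
 * and the three-source MAD then reads the temporary, since the replicating
 * <0;4,1> region the uniform would otherwise need cannot be encoded in a
 * three-source instruction.
 */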
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (brw->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (brw->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
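/* Illustrative consequence (hypothetical GLSL): on Gen6, a partially masked
 * result such as `dst.xy = exp2(v)` is computed into a fresh vec4 temporary
 * and then MOV'd into dst with the .xy writemask, because the align1 MATH
 * instruction cannot honor a writemask itself.  On Gen4/5 math is a message,
 * hence the base_mrf/mlen setup instead.
 */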
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (brw->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 *     w z          y          x w z          y          x
414 *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
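/* Worked example (illustrative): packing vec2(1.0, -2.0).  F32TO16 leaves
 * 0x00003c00 in the x channel and 0x0000c000 in the y channel (upper words
 * cleared by the undocumented behavior noted above).  The SHL of .y by 16
 * gives 0xc0000000, and the final OR with .x gives 0xc0003c00, matching
 * packHalf2x16(vec2(1.0, -2.0)).
 */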
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
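/* Worked example (illustrative): unpacking 0xc0003c00.  The AND with 0xffff
 * leaves 0x3c00 in x, the SHR by 16 leaves 0xc000 in y, and F16TO32 then
 * yields vec2(1.0, -2.0) -- the inverse of the packHalf2x16 example above.
 */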
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
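/* The immediates 0x00, 0x60, 0x70 and 0x78 below are the 8-bit restricted
 * float (VF) encodings of 0.0, 8.0, 16.0 and 24.0, so the type-converting
 * MOV materializes exactly the UD shift counts <0, 8, 16, 24>.
 */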
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(MOV(f, src_reg(shifted)));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(MOV(f, src_reg(shifted)));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
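/* Worked example (illustrative): packUnorm4x8(vec4(0.0, 0.5, 1.0, 2.0)).
 * The saturating MOV clamps to (0.0, 0.5, 1.0, 1.0), MUL by 255 gives
 * (0.0, 127.5, 255.0, 255.0), RNDE rounds half-to-even to (0, 128, 255, 255),
 * and VEC4_OPCODE_PACK_BYTES packs the low byte of each channel with x in
 * the least significant byte, giving 0xffff8000.
 */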
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_G, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of the size of the vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_ERROR:
616 case GLSL_TYPE_INTERFACE:
617 unreachable("not reached");
618 }
619
620 return 0;
621 }
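/* Illustrative sizes under this scheme: a float, vec2 or vec4 each occupy one
 * vec4 slot, a mat3 takes three (one per column), a float[4] array takes four,
 * and a struct { vec3 v; float f; } takes two.
 */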
622
623 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
624 {
625 init();
626
627 this->file = GRF;
628 this->reg = v->alloc.allocate(type_size(type));
629
630 if (type->is_array() || type->is_record()) {
631 this->swizzle = BRW_SWIZZLE_NOOP;
632 } else {
633 this->swizzle = swizzle_for_size(type->vector_elements);
634 }
635
636 this->type = brw_type_for_base_type(type);
637 }
638
639 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
640 {
641 assert(size > 0);
642
643 init();
644
645 this->file = GRF;
646 this->reg = v->alloc.allocate(type_size(type) * size);
647
648 this->swizzle = BRW_SWIZZLE_NOOP;
649
650 this->type = brw_type_for_base_type(type);
651 }
652
653 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
654 {
655 init();
656
657 this->file = GRF;
658 this->reg = v->alloc.allocate(type_size(type));
659
660 if (type->is_array() || type->is_record()) {
661 this->writemask = WRITEMASK_XYZW;
662 } else {
663 this->writemask = (1 << type->vector_elements) - 1;
664 }
665
666 this->type = brw_type_for_base_type(type);
667 }
668
669 /* Our support for uniforms is piggy-backed on the struct
670 * gl_fragment_program, because that's where the values actually
671 * get stored, rather than in some global gl_shader_program uniform
672 * store.
673 */
674 void
675 vec4_visitor::setup_uniform_values(ir_variable *ir)
676 {
677 int namelen = strlen(ir->name);
678
679 /* The data for our (non-builtin) uniforms is stored in a series of
680 * gl_uniform_driver_storage structs for each subcomponent that
681 * glGetUniformLocation() could name. We know it's been set up in the same
682 * order we'd walk the type, so walk the list of storage and find anything
683 * with our name, or the prefix of a component that starts with our name.
684 */
685 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
686 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
687
688 if (strncmp(ir->name, storage->name, namelen) != 0 ||
689 (storage->name[namelen] != 0 &&
690 storage->name[namelen] != '.' &&
691 storage->name[namelen] != '[')) {
692 continue;
693 }
694
695 gl_constant_value *components = storage->storage;
696 unsigned vector_count = (MAX2(storage->array_elements, 1) *
697 storage->type->matrix_columns);
698
699 for (unsigned s = 0; s < vector_count; s++) {
700 assert(uniforms < uniform_array_size);
701 uniform_vector_size[uniforms] = storage->type->vector_elements;
702
703 int i;
704 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
705 stage_prog_data->param[uniforms * 4 + i] = components;
706 components++;
707 }
708 for (; i < 4; i++) {
709 static gl_constant_value zero = { 0.0 };
710 stage_prog_data->param[uniforms * 4 + i] = &zero;
711 }
712
713 uniforms++;
714 }
715 }
716 }
717
718 void
719 vec4_visitor::setup_uniform_clipplane_values()
720 {
721 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
722
723 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
724 assert(this->uniforms < uniform_array_size);
725 this->uniform_vector_size[this->uniforms] = 4;
726 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
727 this->userplane[i].type = BRW_REGISTER_TYPE_F;
728 for (int j = 0; j < 4; ++j) {
729 stage_prog_data->param[this->uniforms * 4 + j] =
730 (gl_constant_value *) &clip_planes[i][j];
731 }
732 ++this->uniforms;
733 }
734 }
735
736 /* Our support for builtin uniforms is even scarier than non-builtin.
737 * It sits on top of the PROG_STATE_VAR parameters that are
738 * automatically updated from GL context state.
739 */
740 void
741 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
742 {
743 const ir_state_slot *const slots = ir->get_state_slots();
744 assert(slots != NULL);
745
746 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
747 /* This state reference has already been setup by ir_to_mesa,
748 * but we'll get the same index back here. We can reference
749 * ParameterValues directly, since unlike brw_fs.cpp, we never
750 * add new state references during compile.
751 */
752 int index = _mesa_add_state_reference(this->prog->Parameters,
753 (gl_state_index *)slots[i].tokens);
754 gl_constant_value *values =
755 &this->prog->Parameters->ParameterValues[index][0];
756
757 assert(this->uniforms < uniform_array_size);
758 this->uniform_vector_size[this->uniforms] = 0;
759 /* Add each of the unique swizzled channels of the element.
760 * This will end up matching the size of the glsl_type of this field.
761 */
762 int last_swiz = -1;
763 for (unsigned int j = 0; j < 4; j++) {
764 int swiz = GET_SWZ(slots[i].swizzle, j);
765 last_swiz = swiz;
766
767 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
768 assert(this->uniforms < uniform_array_size);
769 if (swiz <= last_swiz)
770 this->uniform_vector_size[this->uniforms]++;
771 }
772 this->uniforms++;
773 }
774 }
775
776 dst_reg *
777 vec4_visitor::variable_storage(ir_variable *var)
778 {
779 return (dst_reg *)hash_table_find(this->variable_ht, var);
780 }
781
782 void
783 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
784 enum brw_predicate *predicate)
785 {
786 ir_expression *expr = ir->as_expression();
787
788 *predicate = BRW_PREDICATE_NORMAL;
789
790 if (expr && expr->operation != ir_binop_ubo_load) {
791 src_reg op[3];
792 vec4_instruction *inst;
793
794 assert(expr->get_num_operands() <= 3);
795 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
796 expr->operands[i]->accept(this);
797 op[i] = this->result;
798
799 resolve_ud_negate(&op[i]);
800 }
801
802 switch (expr->operation) {
803 case ir_unop_logic_not:
804 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
805 inst->conditional_mod = BRW_CONDITIONAL_Z;
806 break;
807
808 case ir_binop_logic_xor:
809 if (brw->gen <= 5) {
810 src_reg temp = src_reg(this, ir->type);
811 emit(XOR(dst_reg(temp), op[0], op[1]));
812 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
813 } else {
814 inst = emit(XOR(dst_null_d(), op[0], op[1]));
815 }
816 inst->conditional_mod = BRW_CONDITIONAL_NZ;
817 break;
818
819 case ir_binop_logic_or:
820 if (brw->gen <= 5) {
821 src_reg temp = src_reg(this, ir->type);
822 emit(OR(dst_reg(temp), op[0], op[1]));
823 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
824 } else {
825 inst = emit(OR(dst_null_d(), op[0], op[1]));
826 }
827 inst->conditional_mod = BRW_CONDITIONAL_NZ;
828 break;
829
830 case ir_binop_logic_and:
831 if (brw->gen <= 5) {
832 src_reg temp = src_reg(this, ir->type);
833 emit(AND(dst_reg(temp), op[0], op[1]));
834 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
835 } else {
836 inst = emit(AND(dst_null_d(), op[0], op[1]));
837 }
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 break;
840
841 case ir_unop_f2b:
842 if (brw->gen >= 6) {
843 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
844 } else {
845 inst = emit(MOV(dst_null_f(), op[0]));
846 inst->conditional_mod = BRW_CONDITIONAL_NZ;
847 }
848 break;
849
850 case ir_unop_i2b:
851 if (brw->gen >= 6) {
852 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
853 } else {
854 inst = emit(MOV(dst_null_d(), op[0]));
855 inst->conditional_mod = BRW_CONDITIONAL_NZ;
856 }
857 break;
858
859 case ir_binop_all_equal:
860 if (brw->gen <= 5) {
861 resolve_bool_comparison(expr->operands[0], &op[0]);
862 resolve_bool_comparison(expr->operands[1], &op[1]);
863 }
864 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
865 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
866 break;
867
868 case ir_binop_any_nequal:
869 if (brw->gen <= 5) {
870 resolve_bool_comparison(expr->operands[0], &op[0]);
871 resolve_bool_comparison(expr->operands[1], &op[1]);
872 }
873 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
874 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
875 break;
876
877 case ir_unop_any:
878 if (brw->gen <= 5) {
879 resolve_bool_comparison(expr->operands[0], &op[0]);
880 }
881 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
882 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
883 break;
884
885 case ir_binop_greater:
886 case ir_binop_gequal:
887 case ir_binop_less:
888 case ir_binop_lequal:
889 case ir_binop_equal:
890 case ir_binop_nequal:
891 if (brw->gen <= 5) {
892 resolve_bool_comparison(expr->operands[0], &op[0]);
893 resolve_bool_comparison(expr->operands[1], &op[1]);
894 }
895 emit(CMP(dst_null_d(), op[0], op[1],
896 brw_conditional_for_comparison(expr->operation)));
897 break;
898
899 case ir_triop_csel: {
900 /* Expand the boolean condition into the flag register. */
901 inst = emit(MOV(dst_null_d(), op[0]));
902 inst->conditional_mod = BRW_CONDITIONAL_NZ;
903
904 /* Select which boolean to return. */
905 dst_reg temp(this, expr->operands[1]->type);
906 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
907 inst->predicate = BRW_PREDICATE_NORMAL;
908
909 /* Expand the result to a condition code. */
910 inst = emit(MOV(dst_null_d(), src_reg(temp)));
911 inst->conditional_mod = BRW_CONDITIONAL_NZ;
912 break;
913 }
914
915 default:
916 unreachable("not reached");
917 }
918 return;
919 }
920
921 ir->accept(this);
922
923 resolve_ud_negate(&this->result);
924
925 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
926 inst->conditional_mod = BRW_CONDITIONAL_NZ;
927 }
928
929 /**
930 * Emit a gen6 IF statement with the comparison folded into the IF
931 * instruction.
932 */
933 void
934 vec4_visitor::emit_if_gen6(ir_if *ir)
935 {
936 ir_expression *expr = ir->condition->as_expression();
937
938 if (expr && expr->operation != ir_binop_ubo_load) {
939 src_reg op[3];
940 dst_reg temp;
941
942 assert(expr->get_num_operands() <= 3);
943 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
944 expr->operands[i]->accept(this);
945 op[i] = this->result;
946 }
947
948 switch (expr->operation) {
949 case ir_unop_logic_not:
950 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
951 return;
952
953 case ir_binop_logic_xor:
954 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
955 return;
956
957 case ir_binop_logic_or:
958 temp = dst_reg(this, glsl_type::bool_type);
959 emit(OR(temp, op[0], op[1]));
960 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
961 return;
962
963 case ir_binop_logic_and:
964 temp = dst_reg(this, glsl_type::bool_type);
965 emit(AND(temp, op[0], op[1]));
966 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_f2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_unop_i2b:
974 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
975 return;
976
977 case ir_binop_greater:
978 case ir_binop_gequal:
979 case ir_binop_less:
980 case ir_binop_lequal:
981 case ir_binop_equal:
982 case ir_binop_nequal:
983 emit(IF(op[0], op[1],
984 brw_conditional_for_comparison(expr->operation)));
985 return;
986
987 case ir_binop_all_equal:
988 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
989 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
990 return;
991
992 case ir_binop_any_nequal:
993 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
994 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
995 return;
996
997 case ir_unop_any:
998 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
999 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1000 return;
1001
1002 case ir_triop_csel: {
1003 /* Expand the boolean condition into the flag register. */
1004 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1005 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1006
1007 /* Select which boolean to return. */
1008 dst_reg temp(this, expr->operands[1]->type);
1009 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1010 inst->predicate = BRW_PREDICATE_NORMAL;
1011
1012 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1013 return;
1014 }
1015
1016 default:
1017 unreachable("not reached");
1018 }
1019 return;
1020 }
1021
1022 ir->condition->accept(this);
1023
1024 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_variable *ir)
1029 {
1030 dst_reg *reg = NULL;
1031
1032 if (variable_storage(ir))
1033 return;
1034
1035 switch (ir->data.mode) {
1036 case ir_var_shader_in:
1037 assert(ir->data.location != -1);
1038 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1039 break;
1040
1041 case ir_var_shader_out:
1042 assert(ir->data.location != -1);
1043 reg = new(mem_ctx) dst_reg(this, ir->type);
1044
1045 for (int i = 0; i < type_size(ir->type); i++) {
1046 output_reg[ir->data.location + i] = *reg;
1047 output_reg[ir->data.location + i].reg_offset = i;
1048 output_reg[ir->data.location + i].type =
1049 brw_type_for_base_type(ir->type->get_scalar_type());
1050 output_reg_annotation[ir->data.location + i] = ir->name;
1051 }
1052 break;
1053
1054 case ir_var_auto:
1055 case ir_var_temporary:
1056 reg = new(mem_ctx) dst_reg(this, ir->type);
1057 break;
1058
1059 case ir_var_uniform:
1060 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1061
1062 /* Thanks to the lower_ubo_reference pass, we will see only
1063 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1064 * variables, so no need for them to be in variable_ht.
1065 *
1066 * Some uniforms, such as samplers and atomic counters, have no actual
1067 * storage, so we should ignore them.
1068 */
1069 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1070 return;
1071
1072 /* Track how big the whole uniform variable is, in case we need to put a
1073 * copy of its data into pull constants for array access.
1074 */
1075 assert(this->uniforms < uniform_array_size);
1076 this->uniform_size[this->uniforms] = type_size(ir->type);
1077
1078 if (!strncmp(ir->name, "gl_", 3)) {
1079 setup_builtin_uniform_values(ir);
1080 } else {
1081 setup_uniform_values(ir);
1082 }
1083 break;
1084
1085 case ir_var_system_value:
1086 reg = make_reg_for_system_value(ir);
1087 break;
1088
1089 default:
1090 unreachable("not reached");
1091 }
1092
1093 reg->type = brw_type_for_base_type(ir->type);
1094 hash_table_insert(this->variable_ht, reg, ir);
1095 }
1096
1097 void
1098 vec4_visitor::visit(ir_loop *ir)
1099 {
1100 /* We don't want debugging output to print the whole body of the
1101 * loop as the annotation.
1102 */
1103 this->base_ir = NULL;
1104
1105 emit(BRW_OPCODE_DO);
1106
1107 visit_instructions(&ir->body_instructions);
1108
1109 emit(BRW_OPCODE_WHILE);
1110 }
1111
1112 void
1113 vec4_visitor::visit(ir_loop_jump *ir)
1114 {
1115 switch (ir->mode) {
1116 case ir_loop_jump::jump_break:
1117 emit(BRW_OPCODE_BREAK);
1118 break;
1119 case ir_loop_jump::jump_continue:
1120 emit(BRW_OPCODE_CONTINUE);
1121 break;
1122 }
1123 }
1124
1125
1126 void
1127 vec4_visitor::visit(ir_function_signature *)
1128 {
1129 unreachable("not reached");
1130 }
1131
1132 void
1133 vec4_visitor::visit(ir_function *ir)
1134 {
1135 /* Ignore function bodies other than main() -- we shouldn't see calls to
1136 * them since they should all be inlined.
1137 */
1138 if (strcmp(ir->name, "main") == 0) {
1139 const ir_function_signature *sig;
1140 exec_list empty;
1141
1142 sig = ir->matching_signature(NULL, &empty, false);
1143
1144 assert(sig);
1145
1146 visit_instructions(&sig->body);
1147 }
1148 }
1149
1150 bool
1151 vec4_visitor::try_emit_mad(ir_expression *ir)
1152 {
1153 /* 3-src instructions were introduced in gen6. */
1154 if (brw->gen < 6)
1155 return false;
1156
1157 /* MAD can only handle floating-point data. */
1158 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1159 return false;
1160
1161 ir_rvalue *nonmul = ir->operands[1];
1162 ir_expression *mul = ir->operands[0]->as_expression();
1163
1164 bool mul_negate = false, mul_abs = false;
1165 if (mul && mul->operation == ir_unop_abs) {
1166 mul = mul->operands[0]->as_expression();
1167 mul_abs = true;
1168 } else if (mul && mul->operation == ir_unop_neg) {
1169 mul = mul->operands[0]->as_expression();
1170 mul_negate = true;
1171 }
1172
1173 if (!mul || mul->operation != ir_binop_mul) {
1174 nonmul = ir->operands[0];
1175 mul = ir->operands[1]->as_expression();
1176
1177 if (mul && mul->operation == ir_unop_abs) {
1178 mul = mul->operands[0]->as_expression();
1179 mul_abs = true;
1180 } else if (mul && mul->operation == ir_unop_neg) {
1181 mul = mul->operands[0]->as_expression();
1182 mul_negate = true;
1183 }
1184
1185 if (!mul || mul->operation != ir_binop_mul)
1186 return false;
1187 }
1188
1189 nonmul->accept(this);
1190 src_reg src0 = fix_3src_operand(this->result);
1191
1192 mul->operands[0]->accept(this);
1193 src_reg src1 = fix_3src_operand(this->result);
1194 src1.negate ^= mul_negate;
1195 src1.abs = mul_abs;
1196 if (mul_abs)
1197 src1.negate = false;
1198
1199 mul->operands[1]->accept(this);
1200 src_reg src2 = fix_3src_operand(this->result);
1201 src2.abs = mul_abs;
1202 if (mul_abs)
1203 src2.negate = false;
1204
1205 this->result = src_reg(this, ir->type);
1206 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1207
1208 return true;
1209 }
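/* Illustrative example (hypothetical GLSL): for `a * b + c` or `c + a * b`
 * this emits a single MAD dst, c, a, b -- the first source is the addend --
 * instead of a separate MUL and ADD.  An abs() or negate wrapped around the
 * multiply is folded into the MAD's source modifiers.
 */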
1210
1211 bool
1212 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1213 {
1214 /* This optimization relies on CMP setting the destination to 0 when
1215 * false. Early hardware only sets the least significant bit, and
1216 * leaves the other bits undefined. So we can't use it.
1217 */
1218 if (brw->gen < 6)
1219 return false;
1220
1221 ir_expression *const cmp = ir->operands[0]->as_expression();
1222
1223 if (cmp == NULL)
1224 return false;
1225
1226 switch (cmp->operation) {
1227 case ir_binop_less:
1228 case ir_binop_greater:
1229 case ir_binop_lequal:
1230 case ir_binop_gequal:
1231 case ir_binop_equal:
1232 case ir_binop_nequal:
1233 break;
1234
1235 default:
1236 return false;
1237 }
1238
1239 cmp->operands[0]->accept(this);
1240 const src_reg cmp_src0 = this->result;
1241
1242 cmp->operands[1]->accept(this);
1243 const src_reg cmp_src1 = this->result;
1244
1245 this->result = src_reg(this, ir->type);
1246
1247 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1248 brw_conditional_for_comparison(cmp->operation)));
1249
1250 /* If the comparison is false, this->result will just happen to be zero.
1251 */
1252 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1253 this->result, src_reg(1.0f));
1254 inst->predicate = BRW_PREDICATE_NORMAL;
1255 inst->predicate_inverse = true;
1256
1257 return true;
1258 }
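/* Illustrative example (hypothetical GLSL): for `float(a < b)` this emits a
 * CMP that writes all-ones or zero per channel, then an inverse-predicated
 * SEL that replaces the all-ones channels with 1.0f, so no extra AND or
 * conversion is needed for the b2f.
 */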
1259
1260 void
1261 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1262 src_reg src0, src_reg src1)
1263 {
1264 vec4_instruction *inst;
1265
1266 if (brw->gen >= 6) {
1267 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1268 inst->conditional_mod = conditionalmod;
1269 } else {
1270 emit(CMP(dst, src0, src1, conditionalmod));
1271
1272 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1273 inst->predicate = BRW_PREDICATE_NORMAL;
1274 }
1275 }
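/* For example, min(a, b) on Gen6+ is a single SEL with the L conditional mod
 * (which keeps the smaller source), and max() uses the G conditional mod; on
 * Gen4/5 the same result takes the explicit CMP plus predicated SEL above.
 */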
1276
1277 void
1278 vec4_visitor::emit_lrp(const dst_reg &dst,
1279 const src_reg &x, const src_reg &y, const src_reg &a)
1280 {
1281 if (brw->gen >= 6) {
1282 /* Note that the instruction's argument order is reversed from GLSL
1283 * and the IR.
1284 */
1285 emit(LRP(dst,
1286 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1287 } else {
1288 /* Earlier generations don't support three source operations, so we
1289 * need to emit x*(1-a) + y*a.
1290 */
1291 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1292 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1293 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1294 y_times_a.writemask = dst.writemask;
1295 one_minus_a.writemask = dst.writemask;
1296 x_times_one_minus_a.writemask = dst.writemask;
1297
1298 emit(MUL(y_times_a, y, a));
1299 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1300 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1301 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1302 }
1303 }
1304
1305 void
1306 vec4_visitor::visit(ir_expression *ir)
1307 {
1308 unsigned int operand;
1309 src_reg op[Elements(ir->operands)];
1310 vec4_instruction *inst;
1311
1312 if (ir->operation == ir_binop_add) {
1313 if (try_emit_mad(ir))
1314 return;
1315 }
1316
1317 if (ir->operation == ir_unop_b2f) {
1318 if (try_emit_b2f_of_compare(ir))
1319 return;
1320 }
1321
1322 /* Storage for our result. Ideally for an assignment we'd be using
1323 * the actual storage for the result here, instead.
1324 */
1325 dst_reg result_dst(this, ir->type);
1326 src_reg result_src(result_dst);
1327
1328 if (ir->operation == ir_triop_csel) {
1329 ir->operands[1]->accept(this);
1330 op[1] = this->result;
1331 ir->operands[2]->accept(this);
1332 op[2] = this->result;
1333
1334 enum brw_predicate predicate;
1335 emit_bool_to_cond_code(ir->operands[0], &predicate);
1336 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1337 inst->predicate = predicate;
1338 this->result = result_src;
1339 return;
1340 }
1341
1342 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1343 this->result.file = BAD_FILE;
1344 ir->operands[operand]->accept(this);
1345 if (this->result.file == BAD_FILE) {
1346 fprintf(stderr, "Failed to get tree for expression operand:\n");
1347 ir->operands[operand]->fprint(stderr);
1348 exit(1);
1349 }
1350 op[operand] = this->result;
1351
1352 /* Matrix expression operands should have been broken down to vector
1353 * operations already.
1354 */
1355 assert(!ir->operands[operand]->type->is_matrix());
1356 }
1357
1358 /* If nothing special happens, this is the result. */
1359 this->result = result_src;
1360
1361 switch (ir->operation) {
1362 case ir_unop_logic_not:
1363 emit(NOT(result_dst, op[0]));
1364 break;
1365 case ir_unop_neg:
1366 op[0].negate = !op[0].negate;
1367 emit(MOV(result_dst, op[0]));
1368 break;
1369 case ir_unop_abs:
1370 op[0].abs = true;
1371 op[0].negate = false;
1372 emit(MOV(result_dst, op[0]));
1373 break;
1374
1375 case ir_unop_sign:
1376 if (ir->type->is_float()) {
1377 /* AND(val, 0x80000000) gives the sign bit.
1378 *
1379 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1380 * zero.
1381 */
1382 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1383
1384 op[0].type = BRW_REGISTER_TYPE_UD;
1385 result_dst.type = BRW_REGISTER_TYPE_UD;
1386 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1387
1388 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1389 inst->predicate = BRW_PREDICATE_NORMAL;
1390
1391 this->result.type = BRW_REGISTER_TYPE_F;
1392 } else {
1393 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1394 * -> non-negative val generates 0x00000000.
1395 * Predicated OR sets 1 if val is positive.
1396 */
1397 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1398
1399 emit(ASR(result_dst, op[0], src_reg(31)));
1400
1401 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1402 inst->predicate = BRW_PREDICATE_NORMAL;
1403 }
1404 break;
1405
1406 case ir_unop_rcp:
1407 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1408 break;
1409
1410 case ir_unop_exp2:
1411 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1412 break;
1413 case ir_unop_log2:
1414 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1415 break;
1416 case ir_unop_exp:
1417 case ir_unop_log:
1418 unreachable("not reached: should be handled by ir_explog_to_explog2");
1419 case ir_unop_sin:
1420 case ir_unop_sin_reduced:
1421 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1422 break;
1423 case ir_unop_cos:
1424 case ir_unop_cos_reduced:
1425 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1426 break;
1427
1428 case ir_unop_dFdx:
1429 case ir_unop_dFdx_coarse:
1430 case ir_unop_dFdx_fine:
1431 case ir_unop_dFdy:
1432 case ir_unop_dFdy_coarse:
1433 case ir_unop_dFdy_fine:
1434 unreachable("derivatives not valid in vertex shader");
1435
1436 case ir_unop_bitfield_reverse:
1437 emit(BFREV(result_dst, op[0]));
1438 break;
1439 case ir_unop_bit_count:
1440 emit(CBIT(result_dst, op[0]));
1441 break;
1442 case ir_unop_find_msb: {
1443 src_reg temp = src_reg(this, glsl_type::uint_type);
1444
1445 inst = emit(FBH(dst_reg(temp), op[0]));
1446 inst->dst.writemask = WRITEMASK_XYZW;
1447
1448 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1449 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1450 * subtract the result from 31 to convert the MSB count into an LSB count.
1451 */
1452
1453 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1454 temp.swizzle = BRW_SWIZZLE_NOOP;
1455 emit(MOV(result_dst, temp));
1456
1457 src_reg src_tmp = src_reg(result_dst);
1458 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1459
1460 src_tmp.negate = true;
1461 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1462 inst->predicate = BRW_PREDICATE_NORMAL;
1463 break;
1464 }
1465 case ir_unop_find_lsb:
1466 emit(FBL(result_dst, op[0]));
1467 break;
1468 case ir_unop_saturate:
1469 inst = emit(MOV(result_dst, op[0]));
1470 inst->saturate = true;
1471 break;
1472
1473 case ir_unop_noise:
1474 unreachable("not reached: should be handled by lower_noise");
1475
1476 case ir_binop_add:
1477 emit(ADD(result_dst, op[0], op[1]));
1478 break;
1479 case ir_binop_sub:
1480 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1481
1482 case ir_binop_mul:
1483 if (brw->gen < 8 && ir->type->is_integer()) {
1484 /* For integer multiplication, the MUL uses the low 16 bits of one of
1485 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1486 * accumulates in the contribution of the upper 16 bits of that
1487 * operand. If we can determine that one of the args is in the low
1488 * 16 bits, though, we can just emit a single MUL.
1489 */
1490 if (ir->operands[0]->is_uint16_constant()) {
1491 if (brw->gen < 7)
1492 emit(MUL(result_dst, op[0], op[1]));
1493 else
1494 emit(MUL(result_dst, op[1], op[0]));
1495 } else if (ir->operands[1]->is_uint16_constant()) {
1496 if (brw->gen < 7)
1497 emit(MUL(result_dst, op[1], op[0]));
1498 else
1499 emit(MUL(result_dst, op[0], op[1]));
1500 } else {
1501 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1502
1503 emit(MUL(acc, op[0], op[1]));
1504 emit(MACH(dst_null_d(), op[0], op[1]));
1505 emit(MOV(result_dst, src_reg(acc)));
1506 }
1507 } else {
1508 emit(MUL(result_dst, op[0], op[1]));
1509 }
1510 break;
1511 case ir_binop_imul_high: {
1512 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1513
1514 emit(MUL(acc, op[0], op[1]));
1515 emit(MACH(result_dst, op[0], op[1]));
1516 break;
1517 }
1518 case ir_binop_div:
1519 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1520 assert(ir->type->is_integer());
1521 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1522 break;
1523 case ir_binop_carry: {
1524 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1525
1526 emit(ADDC(dst_null_ud(), op[0], op[1]));
1527 emit(MOV(result_dst, src_reg(acc)));
1528 break;
1529 }
1530 case ir_binop_borrow: {
1531 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1532
1533 emit(SUBB(dst_null_ud(), op[0], op[1]));
1534 emit(MOV(result_dst, src_reg(acc)));
1535 break;
1536 }
1537 case ir_binop_mod:
1538 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1539 assert(ir->type->is_integer());
1540 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1541 break;
1542
1543 case ir_binop_less:
1544 case ir_binop_greater:
1545 case ir_binop_lequal:
1546 case ir_binop_gequal:
1547 case ir_binop_equal:
1548 case ir_binop_nequal: {
1549 if (brw->gen <= 5) {
1550 resolve_bool_comparison(ir->operands[0], &op[0]);
1551 resolve_bool_comparison(ir->operands[1], &op[1]);
1552 }
1553 emit(CMP(result_dst, op[0], op[1],
1554 brw_conditional_for_comparison(ir->operation)));
1555 break;
1556 }
1557
1558 case ir_binop_all_equal:
1559 /* "==" operator producing a scalar boolean. */
1560 if (ir->operands[0]->type->is_vector() ||
1561 ir->operands[1]->type->is_vector()) {
1562 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1563 emit(MOV(result_dst, src_reg(0)));
1564 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1565 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1566 } else {
1567 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1568 }
1569 break;
1570 case ir_binop_any_nequal:
1571 /* "!=" operator producing a scalar boolean. */
1572 if (ir->operands[0]->type->is_vector() ||
1573 ir->operands[1]->type->is_vector()) {
1574 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1575
1576 emit(MOV(result_dst, src_reg(0)));
1577 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1578 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1579 } else {
1580 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1581 }
1582 break;
1583
1584 case ir_unop_any:
1585 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1586 emit(MOV(result_dst, src_reg(0)));
1587
1588 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1589 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1590 break;
1591
1592 case ir_binop_logic_xor:
1593 emit(XOR(result_dst, op[0], op[1]));
1594 break;
1595
1596 case ir_binop_logic_or:
1597 emit(OR(result_dst, op[0], op[1]));
1598 break;
1599
1600 case ir_binop_logic_and:
1601 emit(AND(result_dst, op[0], op[1]));
1602 break;
1603
1604 case ir_binop_dot:
1605 assert(ir->operands[0]->type->is_vector());
1606 assert(ir->operands[0]->type == ir->operands[1]->type);
1607 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1608 break;
1609
1610 case ir_unop_sqrt:
1611 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1612 break;
1613 case ir_unop_rsq:
1614 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1615 break;
1616
1617 case ir_unop_bitcast_i2f:
1618 case ir_unop_bitcast_u2f:
1619 this->result = op[0];
1620 this->result.type = BRW_REGISTER_TYPE_F;
1621 break;
1622
1623 case ir_unop_bitcast_f2i:
1624 this->result = op[0];
1625 this->result.type = BRW_REGISTER_TYPE_D;
1626 break;
1627
1628 case ir_unop_bitcast_f2u:
1629 this->result = op[0];
1630 this->result.type = BRW_REGISTER_TYPE_UD;
1631 break;
1632
1633 case ir_unop_i2f:
1634 case ir_unop_i2u:
1635 case ir_unop_u2i:
1636 case ir_unop_u2f:
1637 case ir_unop_f2i:
1638 case ir_unop_f2u:
1639 emit(MOV(result_dst, op[0]));
1640 break;
1641 case ir_unop_b2i:
1642 emit(AND(result_dst, op[0], src_reg(1)));
1643 break;
1644 case ir_unop_b2f:
1645 if (brw->gen <= 5) {
1646 resolve_bool_comparison(ir->operands[0], &op[0]);
1647 }
1648 op[0].type = BRW_REGISTER_TYPE_D;
1649 result_dst.type = BRW_REGISTER_TYPE_D;
1650 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1651 result_dst.type = BRW_REGISTER_TYPE_F;
1652 break;
1653 case ir_unop_f2b:
1654 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1655 break;
1656 case ir_unop_i2b:
1657 emit(AND(result_dst, op[0], src_reg(1)));
1658 break;
1659
1660 case ir_unop_trunc:
1661 emit(RNDZ(result_dst, op[0]));
1662 break;
1663 case ir_unop_ceil: {
1664 src_reg tmp = src_reg(this, ir->type);
1665 op[0].negate = !op[0].negate;
1666 emit(RNDD(dst_reg(tmp), op[0]));
1667 tmp.negate = true;
1668 emit(MOV(result_dst, tmp));
1669 }
1670 break;
1671 case ir_unop_floor:
1672 inst = emit(RNDD(result_dst, op[0]));
1673 break;
1674 case ir_unop_fract:
1675 inst = emit(FRC(result_dst, op[0]));
1676 break;
1677 case ir_unop_round_even:
1678 emit(RNDE(result_dst, op[0]));
1679 break;
1680
1681 case ir_binop_min:
1682 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1683 break;
1684 case ir_binop_max:
1685 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1686 break;
1687
1688 case ir_binop_pow:
1689 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1690 break;
1691
1692 case ir_unop_bit_not:
1693 inst = emit(NOT(result_dst, op[0]));
1694 break;
1695 case ir_binop_bit_and:
1696 inst = emit(AND(result_dst, op[0], op[1]));
1697 break;
1698 case ir_binop_bit_xor:
1699 inst = emit(XOR(result_dst, op[0], op[1]));
1700 break;
1701 case ir_binop_bit_or:
1702 inst = emit(OR(result_dst, op[0], op[1]));
1703 break;
1704
1705 case ir_binop_lshift:
1706 inst = emit(SHL(result_dst, op[0], op[1]));
1707 break;
1708
1709 case ir_binop_rshift:
1710 if (ir->type->base_type == GLSL_TYPE_INT)
1711 inst = emit(ASR(result_dst, op[0], op[1]));
1712 else
1713 inst = emit(SHR(result_dst, op[0], op[1]));
1714 break;
1715
1716 case ir_binop_bfm:
1717 emit(BFI1(result_dst, op[0], op[1]));
1718 break;
1719
1720 case ir_binop_ubo_load: {
1721 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1722 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1723 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1724 src_reg offset;
1725
1726 /* Now, load the vector from that offset. */
1727 assert(ir->type->is_vector() || ir->type->is_scalar());
1728
1729 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1730 packed_consts.type = result.type;
1731 src_reg surf_index;
1732
1733 if (const_uniform_block) {
1734 /* The block index is a constant, so just emit the binding table entry
1735 * as an immediate.
1736 */
1737 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1738 const_uniform_block->value.u[0]);
1739 } else {
1740 /* The block index is not a constant. Evaluate the index expression
1741 * per-channel and add the base UBO index; the generator will select
1742 * a value from any live channel.
1743 */
1744 surf_index = src_reg(this, glsl_type::uint_type);
1745 emit(ADD(dst_reg(surf_index), op[0],
1746 src_reg(prog_data->base.binding_table.ubo_start)));
1747
1748 /* Assume this may touch any UBO. It would be nice to provide
1749 * a tighter bound, but the array information is already lowered away.
1750 */
1751 brw_mark_surface_used(&prog_data->base,
1752 prog_data->base.binding_table.ubo_start +
1753 shader_prog->NumUniformBlocks - 1);
1754 }
1755
1756 if (const_offset_ir) {
1757 if (brw->gen >= 8) {
1758 /* Store the offset in a GRF so we can send-from-GRF. */
1759 offset = src_reg(this, glsl_type::int_type);
1760 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1761 } else {
1762 /* Immediates are fine on older generations since they'll be moved
1763 * to a (potentially fake) MRF at the generator level.
1764 */
1765 offset = src_reg(const_offset / 16);
1766 }
1767 } else {
1768 offset = src_reg(this, glsl_type::uint_type);
1769 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1770 }
1771
1772 if (brw->gen >= 7) {
1773 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1774 grf_offset.type = offset.type;
1775
1776 emit(MOV(grf_offset, offset));
1777
1778 vec4_instruction *pull =
1779 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1780 dst_reg(packed_consts),
1781 surf_index,
1782 src_reg(grf_offset)));
1783 pull->mlen = 1;
1784 } else {
1785 vec4_instruction *pull =
1786 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1787 dst_reg(packed_consts),
1788 surf_index,
1789 offset));
1790 pull->base_mrf = 14;
1791 pull->mlen = 1;
1792 }
1793
1794 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1795 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1796 const_offset % 16 / 4,
1797 const_offset % 16 / 4,
1798 const_offset % 16 / 4);
1799
1800 /* UBO bools are any nonzero int. We need to convert them to use the
1801 * value of true stored in ctx->Const.UniformBooleanTrue.
1802 */
1803 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1804 emit(CMP(result_dst, packed_consts, src_reg(0u),
1805 BRW_CONDITIONAL_NZ));
1806 } else {
1807 emit(MOV(result_dst, packed_consts));
1808 }
1809 break;
1810 }
1811
1812 case ir_binop_vector_extract:
1813 unreachable("should have been lowered by vec_index_to_cond_assign");
1814
1815 case ir_triop_fma:
1816 op[0] = fix_3src_operand(op[0]);
1817 op[1] = fix_3src_operand(op[1]);
1818 op[2] = fix_3src_operand(op[2]);
1819 /* Note that the instruction's argument order is reversed from GLSL
1820 * and the IR.
1821 */
1822 emit(MAD(result_dst, op[2], op[1], op[0]));
1823 break;
1824
1825 case ir_triop_lrp:
1826 emit_lrp(result_dst, op[0], op[1], op[2]);
1827 break;
1828
1829 case ir_triop_csel:
1830 unreachable("already handled above");
1831 break;
1832
1833 case ir_triop_bfi:
1834 op[0] = fix_3src_operand(op[0]);
1835 op[1] = fix_3src_operand(op[1]);
1836 op[2] = fix_3src_operand(op[2]);
1837 emit(BFI2(result_dst, op[0], op[1], op[2]));
1838 break;
1839
1840 case ir_triop_bitfield_extract:
1841 op[0] = fix_3src_operand(op[0]);
1842 op[1] = fix_3src_operand(op[1]);
1843 op[2] = fix_3src_operand(op[2]);
1844 /* Note that the instruction's argument order is reversed from GLSL
1845 * and the IR.
1846 */
1847 emit(BFE(result_dst, op[2], op[1], op[0]));
1848 break;
1849
1850 case ir_triop_vector_insert:
1851 unreachable("should have been lowered by lower_vector_insert");
1852
1853 case ir_quadop_bitfield_insert:
1854 unreachable("not reached: should be handled by "
1855 "bitfield_insert_to_bfm_bfi\n");
1856
1857 case ir_quadop_vector:
1858 unreachable("not reached: should be handled by lower_quadop_vector");
1859
1860 case ir_unop_pack_half_2x16:
1861 emit_pack_half_2x16(result_dst, op[0]);
1862 break;
1863 case ir_unop_unpack_half_2x16:
1864 emit_unpack_half_2x16(result_dst, op[0]);
1865 break;
1866 case ir_unop_unpack_unorm_4x8:
1867 emit_unpack_unorm_4x8(result_dst, op[0]);
1868 break;
1869 case ir_unop_unpack_snorm_4x8:
1870 emit_unpack_snorm_4x8(result_dst, op[0]);
1871 break;
1872 case ir_unop_pack_unorm_4x8:
1873 emit_pack_unorm_4x8(result_dst, op[0]);
1874 break;
1875 case ir_unop_pack_snorm_4x8:
1876 emit_pack_snorm_4x8(result_dst, op[0]);
1877 break;
1878 case ir_unop_pack_snorm_2x16:
1879 case ir_unop_pack_unorm_2x16:
1880 case ir_unop_unpack_snorm_2x16:
1881 case ir_unop_unpack_unorm_2x16:
1882 unreachable("not reached: should be handled by lower_packing_builtins");
1883 case ir_unop_unpack_half_2x16_split_x:
1884 case ir_unop_unpack_half_2x16_split_y:
1885 case ir_binop_pack_half_2x16_split:
1886 case ir_unop_interpolate_at_centroid:
1887 case ir_binop_interpolate_at_sample:
1888 case ir_binop_interpolate_at_offset:
1889 unreachable("not reached: should not occur in vertex shader");
1890 case ir_binop_ldexp:
1891 unreachable("not reached: should be handled by ldexp_to_arith()");
1892 }
1893 }
1894
1895
1896 void
1897 vec4_visitor::visit(ir_swizzle *ir)
1898 {
1899 src_reg src;
1900 int i = 0;
1901 int swizzle[4];
1902
1903 /* Note that this is only swizzles in expressions, not those on the left
1904 * hand side of an assignment, which do write masking. See ir_assignment
1905 * for that.
1906 */
1907
1908 ir->val->accept(this);
1909 src = this->result;
1910 assert(src.file != BAD_FILE);
1911
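/* Compose the IR swizzle with whatever swizzle is already on the source:
 * result channel i reads the source channel that src.swizzle assigns to
 * the i-th component named by ir->mask.
 */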
1912 for (i = 0; i < ir->type->vector_elements; i++) {
1913 switch (i) {
1914 case 0:
1915 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1916 break;
1917 case 1:
1918 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1919 break;
1920 case 2:
1921 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1922 break;
1923 case 3:
1924 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1925 break;
1926 }
1927 }
1928 for (; i < 4; i++) {
1929 /* Replicate the last channel out. */
1930 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1931 }
1932
1933 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1934
1935 this->result = src;
1936 }
1937
1938 void
1939 vec4_visitor::visit(ir_dereference_variable *ir)
1940 {
1941 const struct glsl_type *type = ir->type;
1942 dst_reg *reg = variable_storage(ir->var);
1943
1944 if (!reg) {
1945 fail("Failed to find variable storage for %s\n", ir->var->name);
1946 this->result = src_reg(brw_null_reg());
1947 return;
1948 }
1949
1950 this->result = src_reg(*reg);
1951
1952 /* System values get their swizzle from the dst_reg writemask */
1953 if (ir->var->data.mode == ir_var_system_value)
1954 return;
1955
1956 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1957 this->result.swizzle = swizzle_for_size(type->vector_elements);
1958 }
1959
1960
1961 int
1962 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1963 {
1964 /* Under normal circumstances array elements are stored consecutively, so
1965 * the stride is equal to the size of the array element.
1966 */
1967 return type_size(ir->type);
1968 }
1969
1970
1971 void
1972 vec4_visitor::visit(ir_dereference_array *ir)
1973 {
1974 ir_constant *constant_index;
1975 src_reg src;
1976 int array_stride = compute_array_stride(ir);
1977
1978 constant_index = ir->array_index->constant_expression_value();
1979
1980 ir->array->accept(this);
1981 src = this->result;
1982
1983 if (constant_index) {
1984 src.reg_offset += constant_index->value.i[0] * array_stride;
1985 } else {
1986 /* Variable index array dereference. It eats the "vec4" of the
1987 * base of the array and an index that offsets the Mesa register
1988 * index.
1989 */
1990 ir->array_index->accept(this);
1991
1992 src_reg index_reg;
1993
1994 if (array_stride == 1) {
1995 index_reg = this->result;
1996 } else {
1997 index_reg = src_reg(this, glsl_type::int_type);
1998
1999 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2000 }
2001
2002 if (src.reladdr) {
2003 src_reg temp = src_reg(this, glsl_type::int_type);
2004
2005 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2006
2007 index_reg = temp;
2008 }
2009
2010 src.reladdr = ralloc(mem_ctx, src_reg);
2011 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2012 }
2013
2014 /* If the type is smaller than a vec4, replicate the last channel out. */
2015 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2016 src.swizzle = swizzle_for_size(ir->type->vector_elements);
2017 else
2018 src.swizzle = BRW_SWIZZLE_NOOP;
2019 src.type = brw_type_for_base_type(ir->type);
2020
2021 this->result = src;
2022 }
2023
2024 void
2025 vec4_visitor::visit(ir_dereference_record *ir)
2026 {
2027 unsigned int i;
2028 const glsl_type *struct_type = ir->record->type;
2029 int offset = 0;
2030
2031 ir->record->accept(this);
2032
2033 for (i = 0; i < struct_type->length; i++) {
2034 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2035 break;
2036 offset += type_size(struct_type->fields.structure[i].type);
2037 }
2038
2039 /* If the type is smaller than a vec4, replicate the last channel out. */
2040 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2041 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2042 else
2043 this->result.swizzle = BRW_SWIZZLE_NOOP;
2044 this->result.type = brw_type_for_base_type(ir->type);
2045
2046 this->result.reg_offset += offset;
2047 }
2048
2049 /**
2050 * We want to be careful in assignment setup to hit the actual storage
2051 * instead of potentially using a temporary like we might with the
2052 * ir_dereference handler.
2053 */
2054 static dst_reg
2055 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2056 {
2057 /* The LHS must be a dereference. If the LHS is a variable indexed array
2058 * access of a vector, it must be separated into a series of conditional moves
2059 * before reaching this point (see ir_vec_index_to_cond_assign).
2060 */
2061 assert(ir->as_dereference());
2062 ir_dereference_array *deref_array = ir->as_dereference_array();
2063 if (deref_array) {
2064 assert(!deref_array->array->type->is_vector());
2065 }
2066
2067 /* Use the rvalue deref handler for the most part. We'll ignore
2068 * swizzles in it and write swizzles using writemask, though.
2069 */
2070 ir->accept(v);
2071 return dst_reg(v->result);
2072 }
2073
2074 void
2075 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2076 const struct glsl_type *type,
2077 enum brw_predicate predicate)
2078 {
2079 if (type->base_type == GLSL_TYPE_STRUCT) {
2080 for (unsigned int i = 0; i < type->length; i++) {
2081 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2082 }
2083 return;
2084 }
2085
2086 if (type->is_array()) {
2087 for (unsigned int i = 0; i < type->length; i++) {
2088 emit_block_move(dst, src, type->fields.array, predicate);
2089 }
2090 return;
2091 }
2092
2093 if (type->is_matrix()) {
2094 const struct glsl_type *vec_type;
2095
2096 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2097 type->vector_elements, 1);
2098
2099 for (int i = 0; i < type->matrix_columns; i++) {
2100 emit_block_move(dst, src, vec_type, predicate);
2101 }
2102 return;
2103 }
2104
2105 assert(type->is_scalar() || type->is_vector());
2106
2107 dst->type = brw_type_for_base_type(type);
2108 src->type = dst->type;
2109
2110 dst->writemask = (1 << type->vector_elements) - 1;
2111
2112 src->swizzle = swizzle_for_size(type->vector_elements);
2113
2114 vec4_instruction *inst = emit(MOV(*dst, *src));
2115 inst->predicate = predicate;
2116
2117 dst->reg_offset++;
2118 src->reg_offset++;
2119 }
2120
2121
2122 /* If the RHS processing resulted in an instruction generating a
2123 * temporary value, and it would be easy to rewrite the instruction to
2124 * generate its result right into the LHS instead, do so. This ends
2125 * up reliably removing instructions where it can be tricky to do so
2126 * later without real UD chain information.
2127 */
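/* For example, for "v = a + b;" the ADD first lands in a fresh temporary;
 * if the conditions below hold, the ADD's destination is rewritten to be
 * v's storage, and the copy MOV that visit(ir_assignment) would otherwise
 * emit is never needed.
 */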
2128 bool
2129 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2130 dst_reg dst,
2131 src_reg src,
2132 vec4_instruction *pre_rhs_inst,
2133 vec4_instruction *last_rhs_inst)
2134 {
2135 /* This could be supported, but it would take more smarts. */
2136 if (ir->condition)
2137 return false;
2138
2139 if (pre_rhs_inst == last_rhs_inst)
2140 return false; /* No instructions generated to work with. */
2141
2142 /* Make sure the last instruction generated our source reg. */
2143 if (src.file != GRF ||
2144 src.file != last_rhs_inst->dst.file ||
2145 src.reg != last_rhs_inst->dst.reg ||
2146 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2147 src.reladdr ||
2148 src.abs ||
2149 src.negate ||
2150 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2151 return false;
2152
2153 /* Check that the last instruction fully initialized the channels
2154 * we want to use, in the order we want to use them. We could
2155 * potentially reswizzle the operands of many instructions so that
2156 * we could handle out of order channels, but don't yet.
2157 */
2158
2159 for (unsigned i = 0; i < 4; i++) {
2160 if (dst.writemask & (1 << i)) {
2161 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2162 return false;
2163
2164 if (BRW_GET_SWZ(src.swizzle, i) != i)
2165 return false;
2166 }
2167 }
2168
2169 /* Success! Rewrite the instruction. */
2170 last_rhs_inst->dst.file = dst.file;
2171 last_rhs_inst->dst.reg = dst.reg;
2172 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2173 last_rhs_inst->dst.reladdr = dst.reladdr;
2174 last_rhs_inst->dst.writemask &= dst.writemask;
2175
2176 return true;
2177 }
2178
2179 void
2180 vec4_visitor::visit(ir_assignment *ir)
2181 {
2182 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2183 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2184
2185 if (!ir->lhs->type->is_scalar() &&
2186 !ir->lhs->type->is_vector()) {
2187 ir->rhs->accept(this);
2188 src_reg src = this->result;
2189
2190 if (ir->condition) {
2191 emit_bool_to_cond_code(ir->condition, &predicate);
2192 }
2193
2194 /* emit_block_move doesn't account for swizzles in the source register.
2195 * This should be ok, since the source register is a structure or an
2196 * array, and those can't be swizzled. But double-check to be sure.
2197 */
2198 assert(src.swizzle ==
2199 (ir->rhs->type->is_matrix()
2200 ? swizzle_for_size(ir->rhs->type->vector_elements)
2201 : BRW_SWIZZLE_NOOP));
2202
2203 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2204 return;
2205 }
2206
2207 /* Now we're down to just a scalar/vector with writemasks. */
2208 int i;
2209
2210 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2211 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2212
2213 ir->rhs->accept(this);
2214
2215 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2216
2217 src_reg src = this->result;
2218
2219 int swizzles[4];
2220 int first_enabled_chan = 0;
2221 int src_chan = 0;
2222
2223 assert(ir->lhs->type->is_vector() ||
2224 ir->lhs->type->is_scalar());
2225 dst.writemask = ir->write_mask;
2226
2227 for (int i = 0; i < 4; i++) {
2228 if (dst.writemask & (1 << i)) {
2229 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2230 break;
2231 }
2232 }
2233
2234 /* Swizzle a small RHS vector into the channels being written.
2235 *
2236 * GLSL IR treats write_mask as dictating how many channels are
2237 * present on the RHS, while in our instructions we need to make
2238 * those channels appear in the slots of the vec4 they're written to.
2239 */
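/* For example, a vec2 RHS assigned through a .yz write mask takes RHS
 * channels 0 and 1 into slots y and z; the unwritten slots just replicate
 * first_enabled_chan, since they are masked off anyway.
 */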
2240 for (int i = 0; i < 4; i++) {
2241 if (dst.writemask & (1 << i))
2242 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2243 else
2244 swizzles[i] = first_enabled_chan;
2245 }
2246 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2247 swizzles[2], swizzles[3]);
2248
2249 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2250 return;
2251 }
2252
2253 if (ir->condition) {
2254 emit_bool_to_cond_code(ir->condition, &predicate);
2255 }
2256
2257 for (i = 0; i < type_size(ir->lhs->type); i++) {
2258 vec4_instruction *inst = emit(MOV(dst, src));
2259 inst->predicate = predicate;
2260
2261 dst.reg_offset++;
2262 src.reg_offset++;
2263 }
2264 }
2265
2266 void
2267 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2268 {
2269 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2270 foreach_in_list(ir_constant, field_value, &ir->components) {
2271 emit_constant_values(dst, field_value);
2272 }
2273 return;
2274 }
2275
2276 if (ir->type->is_array()) {
2277 for (unsigned int i = 0; i < ir->type->length; i++) {
2278 emit_constant_values(dst, ir->array_elements[i]);
2279 }
2280 return;
2281 }
2282
2283 if (ir->type->is_matrix()) {
2284 for (int i = 0; i < ir->type->matrix_columns; i++) {
2285 float *vec = &ir->value.f[i * ir->type->vector_elements];
2286
2287 for (int j = 0; j < ir->type->vector_elements; j++) {
2288 dst->writemask = 1 << j;
2289 dst->type = BRW_REGISTER_TYPE_F;
2290
2291 emit(MOV(*dst, src_reg(vec[j])));
2292 }
2293 dst->reg_offset++;
2294 }
2295 return;
2296 }
2297
2298 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2299
2300 for (int i = 0; i < ir->type->vector_elements; i++) {
2301 if (!(remaining_writemask & (1 << i)))
2302 continue;
2303
2304 dst->writemask = 1 << i;
2305 dst->type = brw_type_for_base_type(ir->type);
2306
2307 /* Find other components that match the one we're about to
2308 * write. Emits fewer instructions for things like vec4(0.5,
2309 * 1.5, 1.5, 1.5).
2310 */
2311 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2312 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2313 if (ir->value.b[i] == ir->value.b[j])
2314 dst->writemask |= (1 << j);
2315 } else {
2316 /* u, i, and f storage all line up, so no need for a
2317 * switch case for comparing each type.
2318 */
2319 if (ir->value.u[i] == ir->value.u[j])
2320 dst->writemask |= (1 << j);
2321 }
2322 }
2323
2324 switch (ir->type->base_type) {
2325 case GLSL_TYPE_FLOAT:
2326 emit(MOV(*dst, src_reg(ir->value.f[i])));
2327 break;
2328 case GLSL_TYPE_INT:
2329 emit(MOV(*dst, src_reg(ir->value.i[i])));
2330 break;
2331 case GLSL_TYPE_UINT:
2332 emit(MOV(*dst, src_reg(ir->value.u[i])));
2333 break;
2334 case GLSL_TYPE_BOOL:
2335 emit(MOV(*dst,
2336 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2337 : 0)));
2338 break;
2339 default:
2340 unreachable("Non-float/uint/int/bool constant");
2341 }
2342
2343 remaining_writemask &= ~dst->writemask;
2344 }
2345 dst->reg_offset++;
2346 }
2347
2348 void
2349 vec4_visitor::visit(ir_constant *ir)
2350 {
2351 dst_reg dst = dst_reg(this, ir->type);
2352 this->result = src_reg(dst);
2353
2354 emit_constant_values(&dst, ir);
2355 }
2356
2357 void
2358 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2359 {
2360 ir_dereference *deref = static_cast<ir_dereference *>(
2361 ir->actual_parameters.get_head());
2362 ir_variable *location = deref->variable_referenced();
2363 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2364 location->data.binding);
2365
2366 /* Calculate the surface offset */
2367 src_reg offset(this, glsl_type::uint_type);
2368 ir_dereference_array *deref_array = deref->as_dereference_array();
2369 if (deref_array) {
2370 deref_array->array_index->accept(this);
2371
2372 src_reg tmp(this, glsl_type::uint_type);
2373 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2374 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2375 } else {
2376 offset = location->data.atomic.offset;
2377 }
2378
2379 /* Emit the appropriate machine instruction */
2380 const char *callee = ir->callee->function_name();
2381 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2382
2383 if (!strcmp("__intrinsic_atomic_read", callee)) {
2384 emit_untyped_surface_read(surf_index, dst, offset);
2385
2386 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2387 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2388 src_reg(), src_reg());
2389
2390 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2391 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2392 src_reg(), src_reg());
2393 }
2394 }
2395
2396 void
2397 vec4_visitor::visit(ir_call *ir)
2398 {
2399 const char *callee = ir->callee->function_name();
2400
2401 if (!strcmp("__intrinsic_atomic_read", callee) ||
2402 !strcmp("__intrinsic_atomic_increment", callee) ||
2403 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2404 visit_atomic_counter_intrinsic(ir);
2405 } else {
2406 unreachable("Unsupported intrinsic.");
2407 }
2408 }
2409
2410 src_reg
2411 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2412 {
2413 vec4_instruction *inst =
2414 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2415 dst_reg(this, glsl_type::uvec4_type));
2416 inst->base_mrf = 2;
2417 inst->mlen = 1;
2418 inst->src[1] = sampler;
2419
2420 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2421 int param_base = inst->base_mrf;
2422 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2423 int zero_mask = 0xf & ~coord_mask;
2424
2425 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2426 coordinate));
2427
2428 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2429 src_reg(0)));
2430
2431 emit(inst);
2432 return src_reg(inst->dst);
2433 }
2434
2435 static bool
2436 is_high_sampler(struct brw_context *brw, src_reg sampler)
2437 {
2438 if (brw->gen < 8 && !brw->is_haswell)
2439 return false;
2440
2441 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2442 }
2443
2444 void
2445 vec4_visitor::visit(ir_texture *ir)
2446 {
2447 uint32_t sampler =
2448 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2449
2450 ir_rvalue *nonconst_sampler_index =
2451 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2452
2453 /* Handle non-constant sampler array indexing */
2454 src_reg sampler_reg;
2455 if (nonconst_sampler_index) {
2456 /* The highest sampler which may be used by this operation is
2457 * the last element of the array. Mark it here, because the generator
2458 * doesn't have enough information to determine the bound.
2459 */
2460 uint32_t array_size = ir->sampler->as_dereference_array()
2461 ->array->type->array_size();
2462
2463 uint32_t max_used = sampler + array_size - 1;
2464 if (ir->op == ir_tg4 && brw->gen < 8) {
2465 max_used += prog_data->base.binding_table.gather_texture_start;
2466 } else {
2467 max_used += prog_data->base.binding_table.texture_start;
2468 }
2469
2470 brw_mark_surface_used(&prog_data->base, max_used);
2471
2472 /* Emit code to evaluate the actual indexing expression */
2473 nonconst_sampler_index->accept(this);
2474 dst_reg temp(this, glsl_type::uint_type);
2475 emit(ADD(temp, this->result, src_reg(sampler)))
2476 ->force_writemask_all = true;
2477 sampler_reg = src_reg(temp);
2478 } else {
2479 /* Single sampler, or constant array index; the indexing expression
2480 * is just an immediate.
2481 */
2482 sampler_reg = src_reg(sampler);
2483 }
2484
2485 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2486 * emitting anything other than setting up the constant result.
2487 */
2488 if (ir->op == ir_tg4) {
2489 ir_constant *chan = ir->lod_info.component->as_constant();
2490 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2491 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2492 dst_reg result(this, ir->type);
2493 this->result = src_reg(result);
2494 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2495 return;
2496 }
2497 }
2498
2499 /* Should be lowered by do_lower_texture_projection */
2500 assert(!ir->projector);
2501
2502 /* Should be lowered */
2503 assert(!ir->offset || !ir->offset->type->is_array());
2504
2505 /* Generate code to compute all the subexpression trees. This has to be
2506 * done before loading any values into MRFs for the sampler message since
2507 * generating these values may involve SEND messages that need the MRFs.
2508 */
2509 src_reg coordinate;
2510 if (ir->coordinate) {
2511 ir->coordinate->accept(this);
2512 coordinate = this->result;
2513 }
2514
2515 src_reg shadow_comparitor;
2516 if (ir->shadow_comparitor) {
2517 ir->shadow_comparitor->accept(this);
2518 shadow_comparitor = this->result;
2519 }
2520
2521 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2522 src_reg offset_value;
2523 if (has_nonconstant_offset) {
2524 ir->offset->accept(this);
2525 offset_value = src_reg(this->result);
2526 }
2527
2528 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2529 src_reg lod, dPdx, dPdy, sample_index, mcs;
2530 switch (ir->op) {
2531 case ir_tex:
2532 lod = src_reg(0.0f);
2533 lod_type = glsl_type::float_type;
2534 break;
2535 case ir_txf:
2536 case ir_txl:
2537 case ir_txs:
2538 ir->lod_info.lod->accept(this);
2539 lod = this->result;
2540 lod_type = ir->lod_info.lod->type;
2541 break;
2542 case ir_query_levels:
2543 lod = src_reg(0);
2544 lod_type = glsl_type::int_type;
2545 break;
2546 case ir_txf_ms:
2547 ir->lod_info.sample_index->accept(this);
2548 sample_index = this->result;
2549 sample_index_type = ir->lod_info.sample_index->type;
2550
2551 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2552 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2553 else
2554 mcs = src_reg(0u);
2555 break;
2556 case ir_txd:
2557 ir->lod_info.grad.dPdx->accept(this);
2558 dPdx = this->result;
2559
2560 ir->lod_info.grad.dPdy->accept(this);
2561 dPdy = this->result;
2562
2563 lod_type = ir->lod_info.grad.dPdx->type;
2564 break;
2565 case ir_txb:
2566 case ir_lod:
2567 case ir_tg4:
2568 break;
2569 }
2570
2571 enum opcode opcode;
2572 switch (ir->op) {
2573 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2574 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2575 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2576 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2577 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2578 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2579 case ir_tg4: opcode = has_nonconstant_offset
2580 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2581 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2582 case ir_txb:
2583 unreachable("TXB is not valid for vertex shaders.");
2584 case ir_lod:
2585 unreachable("LOD is not valid for vertex shaders.");
2586 default:
2587 unreachable("Unrecognized tex op");
2588 }
2589
2590 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2591 opcode, dst_reg(this, ir->type));
2592
2593 if (ir->offset != NULL && !has_nonconstant_offset) {
2594 inst->offset =
2595 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2596 ir->offset->type->vector_elements);
2597 }
2598
2599 /* Stuff the channel select bits in the top of the texture offset */
2600 if (ir->op == ir_tg4)
2601 inst->offset |= gather_channel(ir, sampler) << 16;
2602
2603 /* The message header is necessary for:
2604 * - Gen4 (always)
2605 * - Gen9+ for selecting SIMD4x2
2606 * - Texel offsets
2607 * - Gather channel selection
2608 * - Sampler indices too large to fit in a 4-bit value.
2609 */
2610 inst->header_present =
2611 brw->gen < 5 || brw->gen >= 9 ||
2612 inst->offset != 0 || ir->op == ir_tg4 ||
2613 is_high_sampler(brw, sampler_reg);
2614 inst->base_mrf = 2;
2615 inst->mlen = inst->header_present + 1; /* always at least one */
2616 inst->dst.writemask = WRITEMASK_XYZW;
2617 inst->shadow_compare = ir->shadow_comparitor != NULL;
2618
2619 inst->src[1] = sampler_reg;
2620
2621 /* MRF for the first parameter */
2622 int param_base = inst->base_mrf + inst->header_present;
2623
2624 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2625 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2626 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2627 } else {
2628 /* Load the coordinate */
2629 /* FINISHME: gl_clamp_mask and saturate */
2630 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2631 int zero_mask = 0xf & ~coord_mask;
2632
2633 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2634 coordinate));
2635
2636 if (zero_mask != 0) {
2637 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2638 src_reg(0)));
2639 }
2640 /* Load the shadow comparitor */
2641 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2642 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2643 WRITEMASK_X),
2644 shadow_comparitor));
2645 inst->mlen++;
2646 }
2647
2648 /* Load the LOD info */
2649 if (ir->op == ir_tex || ir->op == ir_txl) {
2650 int mrf, writemask;
2651 if (brw->gen >= 5) {
2652 mrf = param_base + 1;
2653 if (ir->shadow_comparitor) {
2654 writemask = WRITEMASK_Y;
2655 /* mlen already incremented */
2656 } else {
2657 writemask = WRITEMASK_X;
2658 inst->mlen++;
2659 }
2660 } else /* brw->gen == 4 */ {
2661 mrf = param_base;
2662 writemask = WRITEMASK_W;
2663 }
2664 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2665 } else if (ir->op == ir_txf) {
2666 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2667 } else if (ir->op == ir_txf_ms) {
2668 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2669 sample_index));
2670 if (brw->gen >= 7) {
2671 /* MCS data is in the first channel of `mcs`, but we need to get it into
2672 * the .y channel of the second vec4 of params, so replicate .x across
2673 * the whole vec4 and then mask off everything except .y
2674 */
2675 mcs.swizzle = BRW_SWIZZLE_XXXX;
2676 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2677 mcs));
2678 }
2679 inst->mlen++;
2680 } else if (ir->op == ir_txd) {
2681 const glsl_type *type = lod_type;
2682
2683 if (brw->gen >= 5) {
2684 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2685 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2686 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2687 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2688 inst->mlen++;
2689
2690 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2691 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2692 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2693 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2694 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2695 inst->mlen++;
2696
2697 if (ir->shadow_comparitor) {
2698 emit(MOV(dst_reg(MRF, param_base + 2,
2699 ir->shadow_comparitor->type, WRITEMASK_Z),
2700 shadow_comparitor));
2701 }
2702 }
2703 } else /* brw->gen == 4 */ {
2704 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2705 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2706 inst->mlen += 2;
2707 }
2708 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2709 if (ir->shadow_comparitor) {
2710 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2711 shadow_comparitor));
2712 }
2713
2714 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2715 offset_value));
2716 inst->mlen++;
2717 }
2718 }
2719
2720 emit(inst);
2721
2722 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2723 * spec requires layers.
2724 */
2725 if (ir->op == ir_txs) {
2726 glsl_type const *type = ir->sampler->type;
2727 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2728 type->sampler_array) {
2729 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2730 writemask(inst->dst, WRITEMASK_Z),
2731 src_reg(inst->dst), src_reg(6));
2732 }
2733 }
2734
2735 if (brw->gen == 6 && ir->op == ir_tg4) {
2736 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2737 }
2738
2739 swizzle_result(ir, src_reg(inst->dst), sampler);
2740 }
2741
2742 /**
2743 * Apply workarounds for Gen6 gather with UINT/SINT
2744 */
2745 void
2746 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2747 {
2748 if (!wa)
2749 return;
2750
2751 int width = (wa & WA_8BIT) ? 8 : 16;
2752 dst_reg dst_f = dst;
2753 dst_f.type = BRW_REGISTER_TYPE_F;
2754
2755 /* Convert from UNORM to UINT */
2756 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2757 emit(MOV(dst, src_reg(dst_f)));
2758
2759 if (wa & WA_SIGN) {
2760 /* Reinterpret the UINT value as a signed INT value by
2761 * shifting the sign bit into place, then shifting back
2762 * preserving sign.
2763 */
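/* e.g. for an 8-bit value: 0x000000ab << 24 = 0xab000000, and the
 * arithmetic shift right by 24 then copies bit 7 back down, yielding
 * 0xffffffab for a value that was negative.
 */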
2764 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2765 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2766 }
2767 }
2768
2769 /**
2770 * Set up the gather channel based on the swizzle, for gather4.
2771 */
2772 uint32_t
2773 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2774 {
2775 ir_constant *chan = ir->lod_info.component->as_constant();
2776 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2777 switch (swiz) {
2778 case SWIZZLE_X: return 0;
2779 case SWIZZLE_Y:
2780 /* gather4 sampler is broken for green channel on RG32F --
2781 * we must ask for blue instead.
2782 */
2783 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2784 return 2;
2785 return 1;
2786 case SWIZZLE_Z: return 2;
2787 case SWIZZLE_W: return 3;
2788 default:
2789 unreachable("Not reached"); /* zero, one swizzles handled already */
2790 }
2791 }
2792
2793 void
2794 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2795 {
2796 int s = key->tex.swizzles[sampler];
2797
2798 this->result = src_reg(this, ir->type);
2799 dst_reg swizzled_result(this->result);
2800
2801 if (ir->op == ir_query_levels) {
2802 /* # levels is in .w */
2803 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2804 emit(MOV(swizzled_result, orig_val));
2805 return;
2806 }
2807
2808 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2809 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2810 emit(MOV(swizzled_result, orig_val));
2811 return;
2812 }
2813
2814
2815 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2816 int swizzle[4] = {0};
2817
2818 for (int i = 0; i < 4; i++) {
2819 switch (GET_SWZ(s, i)) {
2820 case SWIZZLE_ZERO:
2821 zero_mask |= (1 << i);
2822 break;
2823 case SWIZZLE_ONE:
2824 one_mask |= (1 << i);
2825 break;
2826 default:
2827 copy_mask |= (1 << i);
2828 swizzle[i] = GET_SWZ(s, i);
2829 break;
2830 }
2831 }
2832
2833 if (copy_mask) {
2834 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2835 swizzled_result.writemask = copy_mask;
2836 emit(MOV(swizzled_result, orig_val));
2837 }
2838
2839 if (zero_mask) {
2840 swizzled_result.writemask = zero_mask;
2841 emit(MOV(swizzled_result, src_reg(0.0f)));
2842 }
2843
2844 if (one_mask) {
2845 swizzled_result.writemask = one_mask;
2846 emit(MOV(swizzled_result, src_reg(1.0f)));
2847 }
2848 }
2849
2850 void
2851 vec4_visitor::visit(ir_return *)
2852 {
2853 unreachable("not reached");
2854 }
2855
2856 void
2857 vec4_visitor::visit(ir_discard *)
2858 {
2859 unreachable("not reached");
2860 }
2861
2862 void
2863 vec4_visitor::visit(ir_if *ir)
2864 {
2865 /* Don't point the annotation at the if statement, because then it plus
2866 * the then and else blocks get printed.
2867 */
2868 this->base_ir = ir->condition;
2869
2870 if (brw->gen == 6) {
2871 emit_if_gen6(ir);
2872 } else {
2873 enum brw_predicate predicate;
2874 emit_bool_to_cond_code(ir->condition, &predicate);
2875 emit(IF(predicate));
2876 }
2877
2878 visit_instructions(&ir->then_instructions);
2879
2880 if (!ir->else_instructions.is_empty()) {
2881 this->base_ir = ir->condition;
2882 emit(BRW_OPCODE_ELSE);
2883
2884 visit_instructions(&ir->else_instructions);
2885 }
2886
2887 this->base_ir = ir->condition;
2888 emit(BRW_OPCODE_ENDIF);
2889 }
2890
2891 void
2892 vec4_visitor::visit(ir_emit_vertex *)
2893 {
2894 unreachable("not reached");
2895 }
2896
2897 void
2898 vec4_visitor::visit(ir_end_primitive *)
2899 {
2900 unreachable("not reached");
2901 }
2902
2903 void
2904 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2905 dst_reg dst, src_reg offset,
2906 src_reg src0, src_reg src1)
2907 {
2908 unsigned mlen = 0;
2909
2910 /* Set the atomic operation offset. */
2911 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2912 mlen++;
2913
2914 /* Set the atomic operation arguments. */
2915 if (src0.file != BAD_FILE) {
2916 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2917 mlen++;
2918 }
2919
2920 if (src1.file != BAD_FILE) {
2921 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2922 mlen++;
2923 }
2924
2925 /* Emit the instruction. Note that this maps to the normal SIMD8
2926 * untyped atomic message on Ivy Bridge, but that's OK because
2927 * unused channels will be masked out.
2928 */
2929 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2930 src_reg(atomic_op), src_reg(surf_index));
2931 inst->base_mrf = 0;
2932 inst->mlen = mlen;
2933 }
2934
2935 void
2936 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2937 src_reg offset)
2938 {
2939 /* Set the surface read offset. */
2940 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2941
2942 /* Emit the instruction. Note that this maps to the normal SIMD8
2943 * untyped surface read message, but that's OK because unused
2944 * channels will be masked out.
2945 */
2946 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2947 dst, src_reg(surf_index));
2948 inst->base_mrf = 0;
2949 inst->mlen = 1;
2950 }
2951
2952 void
2953 vec4_visitor::emit_ndc_computation()
2954 {
2955 /* Get the position */
2956 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2957
2958 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2959 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2960 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2961
2962 current_annotation = "NDC";
2963 dst_reg ndc_w = ndc;
2964 ndc_w.writemask = WRITEMASK_W;
2965 src_reg pos_w = pos;
2966 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2967 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2968
2969 dst_reg ndc_xyz = ndc;
2970 ndc_xyz.writemask = WRITEMASK_XYZ;
2971
2972 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2973 }
2974
2975 void
2976 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2977 {
2978 if (brw->gen < 6 &&
2979 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2980 key->userclip_active || brw->has_negative_rhw_bug)) {
2981 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2982 dst_reg header1_w = header1;
2983 header1_w.writemask = WRITEMASK_W;
2984
2985 emit(MOV(header1, 0u));
2986
2987 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2988 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2989
2990 current_annotation = "Point size";
2991 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2992 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2993 }
2994
2995 if (key->userclip_active) {
2996 current_annotation = "Clipping flags";
2997 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2998 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2999
3000 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3001 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3002 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3003
3004 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3005 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3006 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3007 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3008 }
3009
3010 /* i965 clipping workaround:
3011 * 1) Test for -ve rhw
3012 * 2) If set,
3013 * set ndc = (0,0,0,0)
3014 * set ucp[6] = 1
3015 *
3016 * Later, clipping will detect ucp[6] and ensure the primitive is
3017 * clipped against all fixed planes.
3018 */
3019 if (brw->has_negative_rhw_bug) {
3020 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3021 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3022 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3023 vec4_instruction *inst;
3024 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3025 inst->predicate = BRW_PREDICATE_NORMAL;
3026 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3027 inst->predicate = BRW_PREDICATE_NORMAL;
3028 }
3029
3030 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3031 } else if (brw->gen < 6) {
3032 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3033 } else {
3034 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3035 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3036 dst_reg reg_w = reg;
3037 reg_w.writemask = WRITEMASK_W;
3038 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3039 }
3040 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3041 dst_reg reg_y = reg;
3042 reg_y.writemask = WRITEMASK_Y;
3043 reg_y.type = BRW_REGISTER_TYPE_D;
3044 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3045 }
3046 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3047 dst_reg reg_z = reg;
3048 reg_z.writemask = WRITEMASK_Z;
3049 reg_z.type = BRW_REGISTER_TYPE_D;
3050 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3051 }
3052 }
3053 }
3054
3055 void
3056 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3057 {
3058 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3059 *
3060 * "If a linked set of shaders forming the vertex stage contains no
3061 * static write to gl_ClipVertex or gl_ClipDistance, but the
3062 * application has requested clipping against user clip planes through
3063 * the API, then the coordinate written to gl_Position is used for
3064 * comparison against the user clip planes."
3065 *
3066 * This function is only called if the shader didn't write to
3067 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3068 * if the user wrote to it; otherwise we use gl_Position.
3069 */
3070 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3071 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3072 clip_vertex = VARYING_SLOT_POS;
3073 }
3074
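/* Each iteration emits one DP4: component i of this clip-distance slot
 * gets the dot product of the clip vertex with user clip plane i + offset.
 */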
3075 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3076 ++i) {
3077 reg.writemask = 1 << i;
3078 emit(DP4(reg,
3079 src_reg(output_reg[clip_vertex]),
3080 src_reg(this->userplane[i + offset])));
3081 }
3082 }
3083
3084 vec4_instruction *
3085 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3086 {
3087 assert (varying < VARYING_SLOT_MAX);
3088 reg.type = output_reg[varying].type;
3089 current_annotation = output_reg_annotation[varying];
3090 /* Copy the register, saturating if necessary */
3091 return emit(MOV(reg, src_reg(output_reg[varying])));
3092 }
3093
3094 void
3095 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3096 {
3097 reg.type = BRW_REGISTER_TYPE_F;
3098
3099 switch (varying) {
3100 case VARYING_SLOT_PSIZ:
3101 {
3102 /* PSIZ is always in slot 0, and is coupled with other flags. */
3103 current_annotation = "indices, point width, clip flags";
3104 emit_psiz_and_flags(reg);
3105 break;
3106 }
3107 case BRW_VARYING_SLOT_NDC:
3108 current_annotation = "NDC";
3109 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3110 break;
3111 case VARYING_SLOT_POS:
3112 current_annotation = "gl_Position";
3113 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3114 break;
3115 case VARYING_SLOT_EDGE:
3116 /* This is present when doing unfilled polygons. We're supposed to copy
3117 * the edge flag from the user-provided vertex array
3118 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3119 * of that attribute (starts as 1.0f). This is then used in clipping to
3120 * determine which edges should be drawn as wireframe.
3121 */
3122 current_annotation = "edge flag";
3123 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3124 glsl_type::float_type, WRITEMASK_XYZW))));
3125 break;
3126 case BRW_VARYING_SLOT_PAD:
3127 /* No need to write to this slot */
3128 break;
3129 case VARYING_SLOT_COL0:
3130 case VARYING_SLOT_COL1:
3131 case VARYING_SLOT_BFC0:
3132 case VARYING_SLOT_BFC1: {
3133 /* These built-in varyings are only supported in compatibility mode,
3134 * and we only support GS in core profile. So, this must be a vertex
3135 * shader.
3136 */
3137 assert(stage == MESA_SHADER_VERTEX);
3138 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3139 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3140 inst->saturate = true;
3141 break;
3142 }
3143
3144 default:
3145 emit_generic_urb_slot(reg, varying);
3146 break;
3147 }
3148 }
3149
3150 static int
3151 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3152 {
3153 if (brw->gen >= 6) {
3154 /* URB data written (does not include the message header reg) must
3155 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3156 * section 5.4.3.2.2: URB_INTERLEAVED.
3157 *
3158 * URB entries are allocated on a multiple of 1024 bits, so an
3159 * extra 128 bits written here to make the end align to 256 is
3160 * no problem.
3161 */
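/* mlen includes the single message header register, so the data length
 * is mlen - 1; rounding mlen up to an odd value keeps the data length
 * even (a multiple of 2 registers / 256 bits).
 */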
3162 if ((mlen % 2) != 1)
3163 mlen++;
3164 }
3165
3166 return mlen;
3167 }
3168
3169
3170 /**
3171 * Generates the VUE payload plus the necessary URB write instructions to
3172 * output it.
3173 *
3174 * The VUE layout is documented in Volume 2a.
3175 */
3176 void
3177 vec4_visitor::emit_vertex()
3178 {
3179 /* MRF 0 is reserved for the debugger, so start with message header
3180 * in MRF 1.
3181 */
3182 int base_mrf = 1;
3183 int mrf = base_mrf;
3184 /* In the process of generating our URB write message contents, we
3185 * may need to unspill a register or load from an array. Those
3186 * reads would use MRFs 14-15.
3187 */
3188 int max_usable_mrf = 13;
3189
3190 /* The following assertion verifies that max_usable_mrf yields an even
3191 * number of URB write data registers, which meets gen6's length
3192 * alignment requirements.
3193 */
3194 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3195
3196 /* First mrf is the g0-based message header containing URB handles and
3197 * such.
3198 */
3199 emit_urb_write_header(mrf++);
3200
3201 if (brw->gen < 6) {
3202 emit_ndc_computation();
3203 }
3204
3205 /* Lower legacy ff and ClipVertex clipping to clip distances */
3206 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3207 current_annotation = "user clip distances";
3208
3209 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3210 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3211
3212 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3213 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3214 }
3215
3216 /* We may need to split this up into several URB writes, so do them in a
3217 * loop.
3218 */
3219 int slot = 0;
3220 bool complete = false;
3221 do {
3222 /* URB offset is in URB row increments, and each of our MRFs is half of
3223 * one of those, since we're doing interleaved writes.
3224 */
3225 int offset = slot / 2;
3226
3227 mrf = base_mrf + 1;
3228 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3229 emit_urb_slot(dst_reg(MRF, mrf++),
3230 prog_data->vue_map.slot_to_varying[slot]);
3231
3232 /* If this was max_usable_mrf, we can't fit anything more into this
3233 * URB WRITE.
3234 */
3235 if (mrf > max_usable_mrf) {
3236 slot++;
3237 break;
3238 }
3239 }
3240
3241 complete = slot >= prog_data->vue_map.num_slots;
3242 current_annotation = "URB write";
3243 vec4_instruction *inst = emit_urb_write_opcode(complete);
3244 inst->base_mrf = base_mrf;
3245 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3246 inst->offset += offset;
3247 } while(!complete);
3248 }
3249
3250
3251 src_reg
3252 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3253 src_reg *reladdr, int reg_offset)
3254 {
3255 /* Because we store the values to scratch interleaved like our
3256 * vertex data, we need to scale the vec4 index by 2.
3257 */
3258 int message_header_scale = 2;
3259
3260 /* Pre-gen6, the message header uses byte offsets instead of vec4
3261 * (16-byte) offset units.
3262 */
3263 if (brw->gen < 6)
3264 message_header_scale *= 16;
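/* Combined with the interleave factor above, the pre-gen6 scale is
 * 2 * 16 = 32, so the returned offset ends up in bytes rather than
 * vec4 units.
 */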
3265
3266 if (reladdr) {
3267 src_reg index = src_reg(this, glsl_type::int_type);
3268
3269 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3270 src_reg(reg_offset)));
3271 emit_before(block, inst, MUL(dst_reg(index), index,
3272 src_reg(message_header_scale)));
3273
3274 return index;
3275 } else {
3276 return src_reg(reg_offset * message_header_scale);
3277 }
3278 }
3279
3280 src_reg
3281 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3282 src_reg *reladdr, int reg_offset)
3283 {
3284 if (reladdr) {
3285 src_reg index = src_reg(this, glsl_type::int_type);
3286
3287 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3288 src_reg(reg_offset)));
3289
3290 /* Pre-gen6, the message header uses byte offsets instead of vec4
3291 * (16-byte) offset units.
3292 */
3293 if (brw->gen < 6) {
3294 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3295 }
3296
3297 return index;
3298 } else if (brw->gen >= 8) {
3299 /* Store the offset in a GRF so we can send-from-GRF. */
3300 src_reg offset = src_reg(this, glsl_type::int_type);
3301 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3302 return offset;
3303 } else {
3304 int message_header_scale = brw->gen < 6 ? 16 : 1;
3305 return src_reg(reg_offset * message_header_scale);
3306 }
3307 }
3308
3309 /**
3310 * Emits an instruction before @inst to load the value named by @orig_src
3311 * from scratch space at @base_offset to @temp.
3312 *
3313 * @base_offset is measured in 32-byte units (the size of a register).
3314 */
3315 void
3316 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3317 dst_reg temp, src_reg orig_src,
3318 int base_offset)
3319 {
3320 int reg_offset = base_offset + orig_src.reg_offset;
3321 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3322 reg_offset);
3323
3324 emit_before(block, inst, SCRATCH_READ(temp, index));
3325 }
3326
3327 /**
3328 * Emits an instruction after @inst to store the value to be written
3329 * to @orig_dst to scratch space at @base_offset, from @temp.
3330 *
3331 * @base_offset is measured in 32-byte units (the size of a register).
3332 */
3333 void
3334 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3335 int base_offset)
3336 {
3337 int reg_offset = base_offset + inst->dst.reg_offset;
3338 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3339 reg_offset);
3340
3341 /* Create a temporary register to store *inst's result in.
3342 *
3343 * We have to be careful in MOVing from our temporary result register in
3344 * the scratch write. If we swizzle from channels of the temporary that
3345 * weren't initialized, it will confuse live interval analysis, which will
3346 * make spilling fail to make progress.
3347 */
3348 src_reg temp = src_reg(this, glsl_type::vec4_type);
3349 temp.type = inst->dst.type;
3350 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3351 int swizzles[4];
3352 for (int i = 0; i < 4; i++)
3353 if (inst->dst.writemask & (1 << i))
3354 swizzles[i] = i;
3355 else
3356 swizzles[i] = first_writemask_chan;
3357 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3358 swizzles[2], swizzles[3]);
3359
3360 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3361 inst->dst.writemask));
3362 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3363 write->predicate = inst->predicate;
3364 write->ir = inst->ir;
3365 write->annotation = inst->annotation;
3366 inst->insert_after(block, write);
3367
3368 inst->dst.file = temp.file;
3369 inst->dst.reg = temp.reg;
3370 inst->dst.reg_offset = temp.reg_offset;
3371 inst->dst.reladdr = NULL;
3372 }
3373
3374 /**
3375 * We can't generally support array access in GRF space, because a
3376 * single instruction's destination can only span 2 contiguous
3377 * registers. So, we send all GRF arrays that get variable index
3378 * access to scratch space.
3379 */
3380 void
3381 vec4_visitor::move_grf_array_access_to_scratch()
3382 {
3383 int scratch_loc[this->alloc.count];
3384 memset(scratch_loc, -1, sizeof(scratch_loc));
3385
3386 /* First, calculate the set of virtual GRFs that need to be punted
3387 * to scratch due to having any array access on them, and where in
3388 * scratch.
3389 */
3390 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3391 if (inst->dst.file == GRF && inst->dst.reladdr &&
3392 scratch_loc[inst->dst.reg] == -1) {
3393 scratch_loc[inst->dst.reg] = c->last_scratch;
3394 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3395 }
3396
3397 for (int i = 0 ; i < 3; i++) {
3398 src_reg *src = &inst->src[i];
3399
3400 if (src->file == GRF && src->reladdr &&
3401 scratch_loc[src->reg] == -1) {
3402 scratch_loc[src->reg] = c->last_scratch;
3403 c->last_scratch += this->alloc.sizes[src->reg];
3404 }
3405 }
3406 }
3407
3408 /* Now, for anything that will be accessed through scratch, rewrite
3409 * it to load/store. Note that this is a _safe list walk, because
3410 * we may generate a new scratch_write instruction after the one
3411 * we're processing.
3412 */
3413 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3414 /* Set up the annotation tracking for new generated instructions. */
3415 base_ir = inst->ir;
3416 current_annotation = inst->annotation;
3417
3418 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3419 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3420 }
3421
3422 for (int i = 0 ; i < 3; i++) {
3423 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3424 continue;
3425
3426 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3427
3428 emit_scratch_read(block, inst, temp, inst->src[i],
3429 scratch_loc[inst->src[i].reg]);
3430
3431 inst->src[i].file = temp.file;
3432 inst->src[i].reg = temp.reg;
3433 inst->src[i].reg_offset = temp.reg_offset;
3434 inst->src[i].reladdr = NULL;
3435 }
3436 }
3437 }
3438
3439 /**
3440 * Emits an instruction before @inst to load the value named by @orig_src
3441 * from the pull constant buffer (surface) at @base_offset to @temp.
3442 */
3443 void
3444 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3445 dst_reg temp, src_reg orig_src,
3446 int base_offset)
3447 {
3448 int reg_offset = base_offset + orig_src.reg_offset;
3449 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3450 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3451 reg_offset);
3452 vec4_instruction *load;
3453
3454 if (brw->gen >= 7) {
3455 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3456 grf_offset.type = offset.type;
3457 emit_before(block, inst, MOV(grf_offset, offset));
3458
3459 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3460 temp, index, src_reg(grf_offset));
3461 load->mlen = 1;
3462 } else {
3463 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
3464 temp, index, offset);
3465 load->base_mrf = 14;
3466 load->mlen = 1;
3467 }
3468 emit_before(block, inst, load);
3469 }
3470
3471 /**
3472 * Implements array access of uniforms by inserting a
3473 * PULL_CONSTANT_LOAD instruction.
3474 *
3475 * Unlike temporary GRF array access (where we don't support it due to
3476 * the difficulty of doing relative addressing on instruction
3477 * destinations), we could potentially do array access of uniforms
3478 * that were loaded in GRF space as push constants. In real-world
3479 * usage we've seen, though, the arrays being used are always larger
3480 * than we could load as push constants, so just always move all
3481 * uniform array access out to a pull constant buffer.
3482 */
3483 void
3484 vec4_visitor::move_uniform_array_access_to_pull_constants()
3485 {
3486 int pull_constant_loc[this->uniforms];
3487 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3488 bool nested_reladdr;
3489
3490 /* Walk through and find array access of uniforms. Put a copy of that
3491 * uniform in the pull constant buffer.
3492 *
3493 * Note that we don't move constant-indexed accesses to arrays. No
3494 * testing has been done of the performance impact of this choice.
3495 */
3496 do {
3497 nested_reladdr = false;
3498
3499 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3500 for (int i = 0 ; i < 3; i++) {
3501 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3502 continue;
3503
3504 int uniform = inst->src[i].reg;
3505
3506 if (inst->src[i].reladdr->reladdr)
3507 nested_reladdr = true; /* will need another pass */
3508
3509 /* If this array isn't already present in the pull constant buffer,
3510 * add it.
3511 */
3512 if (pull_constant_loc[uniform] == -1) {
3513 const gl_constant_value **values =
3514 &stage_prog_data->param[uniform * 4];
3515
3516 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3517
3518 assert(uniform < uniform_array_size);
3519 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3520 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3521 = values[j];
3522 }
3523 }
3524
3525 /* Set up the annotation tracking for new generated instructions. */
3526 base_ir = inst->ir;
3527 current_annotation = inst->annotation;
3528
3529 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3530
3531 emit_pull_constant_load(block, inst, temp, inst->src[i],
3532 pull_constant_loc[uniform]);
3533
3534 inst->src[i].file = temp.file;
3535 inst->src[i].reg = temp.reg;
3536 inst->src[i].reg_offset = temp.reg_offset;
3537 inst->src[i].reladdr = NULL;
3538 }
3539 }
3540 } while (nested_reladdr);
3541
3542 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3543 * no need to track them as larger-than-vec4 objects. This will be
3544 * relied on in cutting out unused uniform vectors from push
3545 * constants.
3546 */
3547 split_uniform_registers();
3548 }
3549
3550 void
3551 vec4_visitor::resolve_ud_negate(src_reg *reg)
3552 {
3553 if (reg->type != BRW_REGISTER_TYPE_UD ||
3554 !reg->negate)
3555 return;
3556
3557 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3558 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3559 *reg = temp;
3560 }
3561
3562 /**
3563 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3564 *
3565 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3566 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3567 */
3568 void
3569 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3570 {
3571 assert(brw->gen <= 5);
3572
3573 if (!rvalue->type->is_boolean())
3574 return;
3575
3576 src_reg and_result = src_reg(this, rvalue->type);
3577 src_reg neg_result = src_reg(this, rvalue->type);
3578 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3579 emit(MOV(dst_reg(neg_result), negate(and_result)));
3580 *reg = neg_result;
3581 }
3582
3583 vec4_visitor::vec4_visitor(struct brw_context *brw,
3584 struct brw_vec4_compile *c,
3585 struct gl_program *prog,
3586 const struct brw_vue_prog_key *key,
3587 struct brw_vue_prog_data *prog_data,
3588 struct gl_shader_program *shader_prog,
3589 gl_shader_stage stage,
3590 void *mem_ctx,
3591 bool no_spills,
3592 shader_time_shader_type st_base,
3593 shader_time_shader_type st_written,
3594 shader_time_shader_type st_reset)
3595 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3596 c(c),
3597 key(key),
3598 prog_data(prog_data),
3599 sanity_param_count(0),
3600 fail_msg(NULL),
3601 first_non_payload_grf(0),
3602 need_all_constants_in_pull_buffer(false),
3603 no_spills(no_spills),
3604 st_base(st_base),
3605 st_written(st_written),
3606 st_reset(st_reset)
3607 {
3608 this->mem_ctx = mem_ctx;
3609 this->failed = false;
3610
3611 this->base_ir = NULL;
3612 this->current_annotation = NULL;
3613 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3614
3615 this->variable_ht = hash_table_ctor(0,
3616 hash_table_pointer_hash,
3617 hash_table_pointer_compare);
3618
3619 this->virtual_grf_start = NULL;
3620 this->virtual_grf_end = NULL;
3621 this->live_intervals = NULL;
3622
3623 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3624
3625 this->uniforms = 0;
3626
3627 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3628 * at least one. See setup_uniforms() in brw_vec4.cpp.
3629 */
3630 this->uniform_array_size = 1;
3631 if (prog_data) {
3632 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3633 }
3634
3635 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3636 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3637 }
3638
3639 vec4_visitor::~vec4_visitor()
3640 {
3641 hash_table_dtor(this->variable_ht);
3642 }
3643
3644
3645 void
3646 vec4_visitor::fail(const char *format, ...)
3647 {
3648 va_list va;
3649 char *msg;
3650
3651 if (failed)
3652 return;
3653
3654 failed = true;
3655
3656 va_start(va, format);
3657 msg = ralloc_vasprintf(mem_ctx, format, va);
3658 va_end(va);
3659 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3660
3661 this->fail_msg = msg;
3662
3663 if (debug_enabled) {
3664 fprintf(stderr, "%s", msg);
3665 }
3666 }
3667
3668 } /* namespace brw */