i965/vec4: Initialize vec4_instruction::predicate and ::predicate_inverse.
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
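/* A clarifying note (not in the original source): the two predication fields
 * below need explicit defaults because helpers such as emit_minmax() and the
 * predicated SEL paths later in this file set only ->predicate, and rely on
 * ->predicate_inverse already being false rather than an indeterminate value.
 */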
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(brw->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
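/* As an illustrative sketch (not taken from an actual disassembly), a MAD
 * whose first source is a vec4 uniform therefore ends up as something like
 *
 *    mov(8) vgrfN:f  u3:f     { VEC4_OPCODE_UNPACK_UNIFORM }
 *    mad(8) dst:f    vgrfN:f  ...
 *
 * so the three-source instruction only ever reads GRF operands.
 */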
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except for immediates, which gen7
329 * still can't use.
330 */
331
332 if (brw->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
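/* Emit a math instruction while working around per-generation restrictions:
 * both operands are run through fix_math_operand(), Gen6 MATH is align1-only
 * and cannot honor a partial writemask (so compute into a temporary and MOV
 * the result), and pre-Gen6 math is a send with an MRF payload, hence the
 * base_mrf/mlen setup below.
 */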
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (brw->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
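/* A worked example for reference (added note, assuming standard IEEE
 * half-float encodings): packHalf2x16(vec2(1.0, -2.0)) stores 1.0 -> 0x3c00
 * in the low word and -2.0 -> 0xc000 in the high word, giving 0xc0003c00.
 * The F32TO16 / SHL / OR sequence below builds exactly that layout in each
 * write-channel of dst.
 */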
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (brw->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
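/* The inverse of emit_pack_half_2x16() above.  For reference,
 * unpackHalf2x16(0xc0003c00) yields vec2(1.0, -2.0): the low word feeds .x
 * and the high word feeds .y, which is why the AND below selects bits 15:0
 * and the SHR selects bits 31:16 before the F16TO32 conversion.
 */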
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
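/* Note: 0x00, 0x60, 0x70, 0x78 are the 8-bit vector-float encodings of
 * <0.0, 8.0, 16.0, 24.0>; the type-converting MOV above turns them into the
 * integer shift counts <0, 8, 16, 24>.
 */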
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(MOV(f, src_reg(shifted)));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(MOV(f, src_reg(shifted)));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
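/* A worked example of the sequence below (added for reference, assuming the
 * usual packUnorm4x8 byte order with .x in the least significant byte):
 * packUnorm4x8(vec4(0.0, 0.5, 1.0, 2.0)) saturates to <0, 0.5, 1, 1>, scales
 * to <0, 127.5, 255, 255>, rounds-to-even to <0, 128, 255, 255>, and packs to
 * 0xffff8000.
 */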
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_G, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
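/* Size of a type in vec4 register slots.  Every scalar or vector occupies a
 * full slot, so e.g. float -> 1, vec3 -> 1, mat3 -> 3 (one slot per column),
 * float[4] -> 4; samplers and atomic counters occupy no slots.
 */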
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_ERROR:
616 case GLSL_TYPE_INTERFACE:
617 unreachable("not reached");
618 }
619
620 return 0;
621 }
622
623 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
624 {
625 init();
626
627 this->file = GRF;
628 this->reg = v->alloc.allocate(type_size(type));
629
630 if (type->is_array() || type->is_record()) {
631 this->swizzle = BRW_SWIZZLE_NOOP;
632 } else {
633 this->swizzle = swizzle_for_size(type->vector_elements);
634 }
635
636 this->type = brw_type_for_base_type(type);
637 }
638
639 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
640 {
641 assert(size > 0);
642
643 init();
644
645 this->file = GRF;
646 this->reg = v->alloc.allocate(type_size(type) * size);
647
648 this->swizzle = BRW_SWIZZLE_NOOP;
649
650 this->type = brw_type_for_base_type(type);
651 }
652
653 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
654 {
655 init();
656
657 this->file = GRF;
658 this->reg = v->alloc.allocate(type_size(type));
659
660 if (type->is_array() || type->is_record()) {
661 this->writemask = WRITEMASK_XYZW;
662 } else {
663 this->writemask = (1 << type->vector_elements) - 1;
664 }
665
666 this->type = brw_type_for_base_type(type);
667 }
668
669 /* Our support for uniforms is piggy-backed on the struct
670 * gl_fragment_program, because that's where the values actually
671 * get stored, rather than in some global gl_shader_program uniform
672 * store.
673 */
674 void
675 vec4_visitor::setup_uniform_values(ir_variable *ir)
676 {
677 int namelen = strlen(ir->name);
678
679 /* The data for our (non-builtin) uniforms is stored in a series of
680 * gl_uniform_driver_storage structs for each subcomponent that
681 * glGetUniformLocation() could name. We know it's been set up in the same
682 * order we'd walk the type, so walk the list of storage and find anything
683 * with our name, or the prefix of a component that starts with our name.
684 */
685 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
686 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
687
688 if (strncmp(ir->name, storage->name, namelen) != 0 ||
689 (storage->name[namelen] != 0 &&
690 storage->name[namelen] != '.' &&
691 storage->name[namelen] != '[')) {
692 continue;
693 }
694
695 gl_constant_value *components = storage->storage;
696 unsigned vector_count = (MAX2(storage->array_elements, 1) *
697 storage->type->matrix_columns);
698
699 for (unsigned s = 0; s < vector_count; s++) {
700 assert(uniforms < uniform_array_size);
701 uniform_vector_size[uniforms] = storage->type->vector_elements;
702
703 int i;
704 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
705 stage_prog_data->param[uniforms * 4 + i] = components;
706 components++;
707 }
708 for (; i < 4; i++) {
709 static gl_constant_value zero = { 0.0 };
710 stage_prog_data->param[uniforms * 4 + i] = &zero;
711 }
712
713 uniforms++;
714 }
715 }
716 }
717
718 void
719 vec4_visitor::setup_uniform_clipplane_values()
720 {
721 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
722
723 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
724 assert(this->uniforms < uniform_array_size);
725 this->uniform_vector_size[this->uniforms] = 4;
726 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
727 this->userplane[i].type = BRW_REGISTER_TYPE_F;
728 for (int j = 0; j < 4; ++j) {
729 stage_prog_data->param[this->uniforms * 4 + j] =
730 (gl_constant_value *) &clip_planes[i][j];
731 }
732 ++this->uniforms;
733 }
734 }
735
736 /* Our support for builtin uniforms is even scarier than non-builtin.
737 * It sits on top of the PROG_STATE_VAR parameters that are
738 * automatically updated from GL context state.
739 */
740 void
741 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
742 {
743 const ir_state_slot *const slots = ir->get_state_slots();
744 assert(slots != NULL);
745
746 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
747 /* This state reference has already been setup by ir_to_mesa,
748 * but we'll get the same index back here. We can reference
749 * ParameterValues directly, since unlike brw_fs.cpp, we never
750 * add new state references during compile.
751 */
752 int index = _mesa_add_state_reference(this->prog->Parameters,
753 (gl_state_index *)slots[i].tokens);
754 gl_constant_value *values =
755 &this->prog->Parameters->ParameterValues[index][0];
756
757 assert(this->uniforms < uniform_array_size);
758 this->uniform_vector_size[this->uniforms] = 0;
759 /* Add each of the unique swizzled channels of the element.
760 * This will end up matching the size of the glsl_type of this field.
761 */
762 int last_swiz = -1;
763 for (unsigned int j = 0; j < 4; j++) {
764 int swiz = GET_SWZ(slots[i].swizzle, j);
765 last_swiz = swiz;
766
767 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
768 assert(this->uniforms < uniform_array_size);
769 if (swiz <= last_swiz)
770 this->uniform_vector_size[this->uniforms]++;
771 }
772 this->uniforms++;
773 }
774 }
775
776 dst_reg *
777 vec4_visitor::variable_storage(ir_variable *var)
778 {
779 return (dst_reg *)hash_table_find(this->variable_ht, var);
780 }
781
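/* Evaluate a boolean rvalue and leave its value in the flag register so the
 * caller can predicate a following instruction (an IF or SEL).  *predicate is
 * set to the predication mode the caller should use: BRW_PREDICATE_NORMAL for
 * most conditions, or an ALIGN16_ANY4H/ALL4H mode for the vector any() /
 * all-equal / any-nequal conditions handled below.
 */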
782 void
783 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
784 enum brw_predicate *predicate)
785 {
786 ir_expression *expr = ir->as_expression();
787
788 *predicate = BRW_PREDICATE_NORMAL;
789
790 if (expr && expr->operation != ir_binop_ubo_load) {
791 src_reg op[3];
792 vec4_instruction *inst;
793
794 assert(expr->get_num_operands() <= 3);
795 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
796 expr->operands[i]->accept(this);
797 op[i] = this->result;
798
799 resolve_ud_negate(&op[i]);
800 }
801
802 switch (expr->operation) {
803 case ir_unop_logic_not:
804 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
805 inst->conditional_mod = BRW_CONDITIONAL_Z;
806 break;
807
808 case ir_binop_logic_xor:
809 if (brw->gen <= 5) {
810 src_reg temp = src_reg(this, ir->type);
811 emit(XOR(dst_reg(temp), op[0], op[1]));
812 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
813 } else {
814 inst = emit(XOR(dst_null_d(), op[0], op[1]));
815 }
816 inst->conditional_mod = BRW_CONDITIONAL_NZ;
817 break;
818
819 case ir_binop_logic_or:
820 if (brw->gen <= 5) {
821 src_reg temp = src_reg(this, ir->type);
822 emit(OR(dst_reg(temp), op[0], op[1]));
823 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
824 } else {
825 inst = emit(OR(dst_null_d(), op[0], op[1]));
826 }
827 inst->conditional_mod = BRW_CONDITIONAL_NZ;
828 break;
829
830 case ir_binop_logic_and:
831 if (brw->gen <= 5) {
832 src_reg temp = src_reg(this, ir->type);
833 emit(AND(dst_reg(temp), op[0], op[1]));
834 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
835 } else {
836 inst = emit(AND(dst_null_d(), op[0], op[1]));
837 }
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 break;
840
841 case ir_unop_f2b:
842 if (brw->gen >= 6) {
843 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
844 } else {
845 inst = emit(MOV(dst_null_f(), op[0]));
846 inst->conditional_mod = BRW_CONDITIONAL_NZ;
847 }
848 break;
849
850 case ir_unop_i2b:
851 if (brw->gen >= 6) {
852 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
853 } else {
854 inst = emit(MOV(dst_null_d(), op[0]));
855 inst->conditional_mod = BRW_CONDITIONAL_NZ;
856 }
857 break;
858
859 case ir_binop_all_equal:
860 if (brw->gen <= 5) {
861 resolve_bool_comparison(expr->operands[0], &op[0]);
862 resolve_bool_comparison(expr->operands[1], &op[1]);
863 }
864 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
865 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
866 break;
867
868 case ir_binop_any_nequal:
869 if (brw->gen <= 5) {
870 resolve_bool_comparison(expr->operands[0], &op[0]);
871 resolve_bool_comparison(expr->operands[1], &op[1]);
872 }
873 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
874 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
875 break;
876
877 case ir_unop_any:
878 if (brw->gen <= 5) {
879 resolve_bool_comparison(expr->operands[0], &op[0]);
880 }
881 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
882 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
883 break;
884
885 case ir_binop_greater:
886 case ir_binop_gequal:
887 case ir_binop_less:
888 case ir_binop_lequal:
889 case ir_binop_equal:
890 case ir_binop_nequal:
891 if (brw->gen <= 5) {
892 resolve_bool_comparison(expr->operands[0], &op[0]);
893 resolve_bool_comparison(expr->operands[1], &op[1]);
894 }
895 emit(CMP(dst_null_d(), op[0], op[1],
896 brw_conditional_for_comparison(expr->operation)));
897 break;
898
899 case ir_triop_csel: {
900 /* Expand the boolean condition into the flag register. */
901 inst = emit(MOV(dst_null_d(), op[0]));
902 inst->conditional_mod = BRW_CONDITIONAL_NZ;
903
904 /* Select which boolean to return. */
905 dst_reg temp(this, expr->operands[1]->type);
906 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
907 inst->predicate = BRW_PREDICATE_NORMAL;
908
909 /* Expand the result to a condition code. */
910 inst = emit(MOV(dst_null_d(), src_reg(temp)));
911 inst->conditional_mod = BRW_CONDITIONAL_NZ;
912 break;
913 }
914
915 default:
916 unreachable("not reached");
917 }
918 return;
919 }
920
921 ir->accept(this);
922
923 resolve_ud_negate(&this->result);
924
925 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
926 inst->conditional_mod = BRW_CONDITIONAL_NZ;
927 }
928
929 /**
930 * Emit a gen6 IF statement with the comparison folded into the IF
931 * instruction.
932 */
933 void
934 vec4_visitor::emit_if_gen6(ir_if *ir)
935 {
936 ir_expression *expr = ir->condition->as_expression();
937
938 if (expr && expr->operation != ir_binop_ubo_load) {
939 src_reg op[3];
940 dst_reg temp;
941
942 assert(expr->get_num_operands() <= 3);
943 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
944 expr->operands[i]->accept(this);
945 op[i] = this->result;
946 }
947
948 switch (expr->operation) {
949 case ir_unop_logic_not:
950 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
951 return;
952
953 case ir_binop_logic_xor:
954 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
955 return;
956
957 case ir_binop_logic_or:
958 temp = dst_reg(this, glsl_type::bool_type);
959 emit(OR(temp, op[0], op[1]));
960 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
961 return;
962
963 case ir_binop_logic_and:
964 temp = dst_reg(this, glsl_type::bool_type);
965 emit(AND(temp, op[0], op[1]));
966 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_f2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_unop_i2b:
974 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
975 return;
976
977 case ir_binop_greater:
978 case ir_binop_gequal:
979 case ir_binop_less:
980 case ir_binop_lequal:
981 case ir_binop_equal:
982 case ir_binop_nequal:
983 emit(IF(op[0], op[1],
984 brw_conditional_for_comparison(expr->operation)));
985 return;
986
987 case ir_binop_all_equal:
988 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
989 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
990 return;
991
992 case ir_binop_any_nequal:
993 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
994 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
995 return;
996
997 case ir_unop_any:
998 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
999 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1000 return;
1001
1002 case ir_triop_csel: {
1003 /* Expand the boolean condition into the flag register. */
1004 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1005 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1006
1007 /* Select which boolean to return. */
1008 dst_reg temp(this, expr->operands[1]->type);
1009 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1010 inst->predicate = BRW_PREDICATE_NORMAL;
1011
1012 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1013 return;
1014 }
1015
1016 default:
1017 unreachable("not reached");
1018 }
1019 return;
1020 }
1021
1022 ir->condition->accept(this);
1023
1024 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_variable *ir)
1029 {
1030 dst_reg *reg = NULL;
1031
1032 if (variable_storage(ir))
1033 return;
1034
1035 switch (ir->data.mode) {
1036 case ir_var_shader_in:
1037 assert(ir->data.location != -1);
1038 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1039 break;
1040
1041 case ir_var_shader_out:
1042 assert(ir->data.location != -1);
1043 reg = new(mem_ctx) dst_reg(this, ir->type);
1044
1045 for (int i = 0; i < type_size(ir->type); i++) {
1046 output_reg[ir->data.location + i] = *reg;
1047 output_reg[ir->data.location + i].reg_offset = i;
1048 output_reg[ir->data.location + i].type =
1049 brw_type_for_base_type(ir->type->get_scalar_type());
1050 output_reg_annotation[ir->data.location + i] = ir->name;
1051 }
1052 break;
1053
1054 case ir_var_auto:
1055 case ir_var_temporary:
1056 reg = new(mem_ctx) dst_reg(this, ir->type);
1057 break;
1058
1059 case ir_var_uniform:
1060 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1061
1062 /* Thanks to the lower_ubo_reference pass, we will see only
1063 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1064 * variables, so no need for them to be in variable_ht.
1065 *
1066 * Some uniforms, such as samplers and atomic counters, have no actual
1067 * storage, so we should ignore them.
1068 */
1069 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1070 return;
1071
1072 /* Track how big the whole uniform variable is, in case we need to put a
1073 * copy of its data into pull constants for array access.
1074 */
1075 assert(this->uniforms < uniform_array_size);
1076 this->uniform_size[this->uniforms] = type_size(ir->type);
1077
1078 if (!strncmp(ir->name, "gl_", 3)) {
1079 setup_builtin_uniform_values(ir);
1080 } else {
1081 setup_uniform_values(ir);
1082 }
1083 break;
1084
1085 case ir_var_system_value:
1086 reg = make_reg_for_system_value(ir);
1087 break;
1088
1089 default:
1090 unreachable("not reached");
1091 }
1092
1093 reg->type = brw_type_for_base_type(ir->type);
1094 hash_table_insert(this->variable_ht, reg, ir);
1095 }
1096
1097 void
1098 vec4_visitor::visit(ir_loop *ir)
1099 {
1100 /* We don't want debugging output to print the whole body of the
1101 * loop as the annotation.
1102 */
1103 this->base_ir = NULL;
1104
1105 emit(BRW_OPCODE_DO);
1106
1107 visit_instructions(&ir->body_instructions);
1108
1109 emit(BRW_OPCODE_WHILE);
1110 }
1111
1112 void
1113 vec4_visitor::visit(ir_loop_jump *ir)
1114 {
1115 switch (ir->mode) {
1116 case ir_loop_jump::jump_break:
1117 emit(BRW_OPCODE_BREAK);
1118 break;
1119 case ir_loop_jump::jump_continue:
1120 emit(BRW_OPCODE_CONTINUE);
1121 break;
1122 }
1123 }
1124
1125
1126 void
1127 vec4_visitor::visit(ir_function_signature *)
1128 {
1129 unreachable("not reached");
1130 }
1131
1132 void
1133 vec4_visitor::visit(ir_function *ir)
1134 {
1135 /* Ignore function bodies other than main() -- we shouldn't see calls to
1136 * them since they should all be inlined.
1137 */
1138 if (strcmp(ir->name, "main") == 0) {
1139 const ir_function_signature *sig;
1140 exec_list empty;
1141
1142 sig = ir->matching_signature(NULL, &empty, false);
1143
1144 assert(sig);
1145
1146 visit_instructions(&sig->body);
1147 }
1148 }
1149
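/* Try to fold an add of a multiply into a single MAD.  The hardware MAD
 * computes src1 * src2 + src0, so the non-multiply operand is passed first
 * below and the two factors follow (ir_triop_fma later in this file reverses
 * the GLSL argument order for the same reason).
 */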
1150 bool
1151 vec4_visitor::try_emit_mad(ir_expression *ir)
1152 {
1153 /* 3-src instructions were introduced in gen6. */
1154 if (brw->gen < 6)
1155 return false;
1156
1157 /* MAD can only handle floating-point data. */
1158 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1159 return false;
1160
1161 ir_rvalue *nonmul = ir->operands[1];
1162 ir_expression *mul = ir->operands[0]->as_expression();
1163
1164 if (!mul || mul->operation != ir_binop_mul) {
1165 nonmul = ir->operands[0];
1166 mul = ir->operands[1]->as_expression();
1167
1168 if (!mul || mul->operation != ir_binop_mul)
1169 return false;
1170 }
1171
1172 nonmul->accept(this);
1173 src_reg src0 = fix_3src_operand(this->result);
1174
1175 mul->operands[0]->accept(this);
1176 src_reg src1 = fix_3src_operand(this->result);
1177
1178 mul->operands[1]->accept(this);
1179 src_reg src2 = fix_3src_operand(this->result);
1180
1181 this->result = src_reg(this, ir->type);
1182 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1183
1184 return true;
1185 }
1186
1187 bool
1188 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1189 {
1190 /* This optimization relies on CMP setting the destination to 0 when
1191 * false. Early hardware only sets the least significant bit, and
1192 * leaves the other bits undefined. So we can't use it.
1193 */
1194 if (brw->gen < 6)
1195 return false;
1196
1197 ir_expression *const cmp = ir->operands[0]->as_expression();
1198
1199 if (cmp == NULL)
1200 return false;
1201
1202 switch (cmp->operation) {
1203 case ir_binop_less:
1204 case ir_binop_greater:
1205 case ir_binop_lequal:
1206 case ir_binop_gequal:
1207 case ir_binop_equal:
1208 case ir_binop_nequal:
1209 break;
1210
1211 default:
1212 return false;
1213 }
1214
1215 cmp->operands[0]->accept(this);
1216 const src_reg cmp_src0 = this->result;
1217
1218 cmp->operands[1]->accept(this);
1219 const src_reg cmp_src1 = this->result;
1220
1221 this->result = src_reg(this, ir->type);
1222
1223 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1224 brw_conditional_for_comparison(cmp->operation)));
1225
1226 /* If the comparison is false, this->result will just happen to be zero.
1227 */
1228 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1229 this->result, src_reg(1.0f));
1230 inst->predicate = BRW_PREDICATE_NORMAL;
1231 inst->predicate_inverse = true;
1232
1233 return true;
1234 }
1235
1236 void
1237 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1238 src_reg src0, src_reg src1)
1239 {
1240 vec4_instruction *inst;
1241
1242 if (brw->gen >= 6) {
1243 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1244 inst->conditional_mod = conditionalmod;
1245 } else {
1246 emit(CMP(dst, src0, src1, conditionalmod));
1247
1248 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1249 inst->predicate = BRW_PREDICATE_NORMAL;
1250 }
1251 }
1252
1253 void
1254 vec4_visitor::emit_lrp(const dst_reg &dst,
1255 const src_reg &x, const src_reg &y, const src_reg &a)
1256 {
1257 if (brw->gen >= 6) {
1258 /* Note that the instruction's argument order is reversed from GLSL
1259 * and the IR.
1260 */
1261 emit(LRP(dst,
1262 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1263 } else {
1264 /* Earlier generations don't support three source operations, so we
1265 * need to emit x*(1-a) + y*a.
1266 */
1267 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1268 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1269 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1270 y_times_a.writemask = dst.writemask;
1271 one_minus_a.writemask = dst.writemask;
1272 x_times_one_minus_a.writemask = dst.writemask;
1273
1274 emit(MUL(y_times_a, y, a));
1275 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1276 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1277 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1278 }
1279 }
1280
1281 void
1282 vec4_visitor::visit(ir_expression *ir)
1283 {
1284 unsigned int operand;
1285 src_reg op[Elements(ir->operands)];
1286 vec4_instruction *inst;
1287
1288 if (ir->operation == ir_binop_add) {
1289 if (try_emit_mad(ir))
1290 return;
1291 }
1292
1293 if (ir->operation == ir_unop_b2f) {
1294 if (try_emit_b2f_of_compare(ir))
1295 return;
1296 }
1297
1298 /* Storage for our result. Ideally for an assignment we'd be using
1299 * the actual storage for the result here, instead.
1300 */
1301 dst_reg result_dst(this, ir->type);
1302 src_reg result_src(result_dst);
1303
1304 if (ir->operation == ir_triop_csel) {
1305 ir->operands[1]->accept(this);
1306 op[1] = this->result;
1307 ir->operands[2]->accept(this);
1308 op[2] = this->result;
1309
1310 enum brw_predicate predicate;
1311 emit_bool_to_cond_code(ir->operands[0], &predicate);
1312 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1313 inst->predicate = predicate;
1314 this->result = result_src;
1315 return;
1316 }
1317
1318 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1319 this->result.file = BAD_FILE;
1320 ir->operands[operand]->accept(this);
1321 if (this->result.file == BAD_FILE) {
1322 fprintf(stderr, "Failed to get tree for expression operand:\n");
1323 ir->operands[operand]->fprint(stderr);
1324 exit(1);
1325 }
1326 op[operand] = this->result;
1327
1328 /* Matrix expression operands should have been broken down to vector
1329 * operations already.
1330 */
1331 assert(!ir->operands[operand]->type->is_matrix());
1332 }
1333
1334 /* If nothing special happens, this is the result. */
1335 this->result = result_src;
1336
1337 switch (ir->operation) {
1338 case ir_unop_logic_not:
1339 emit(NOT(result_dst, op[0]));
1340 break;
1341 case ir_unop_neg:
1342 op[0].negate = !op[0].negate;
1343 emit(MOV(result_dst, op[0]));
1344 break;
1345 case ir_unop_abs:
1346 op[0].abs = true;
1347 op[0].negate = false;
1348 emit(MOV(result_dst, op[0]));
1349 break;
1350
1351 case ir_unop_sign:
1352 if (ir->type->is_float()) {
1353 /* AND(val, 0x80000000) gives the sign bit.
1354 *
1355 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1356 * zero.
1357 */
1358 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1359
1360 op[0].type = BRW_REGISTER_TYPE_UD;
1361 result_dst.type = BRW_REGISTER_TYPE_UD;
1362 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1363
1364 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1365 inst->predicate = BRW_PREDICATE_NORMAL;
1366
1367 this->result.type = BRW_REGISTER_TYPE_F;
1368 } else {
1369 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1370 * -> non-negative val generates 0x00000000.
1371 * Predicated OR sets 1 if val is positive.
1372 */
1373 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1374
1375 emit(ASR(result_dst, op[0], src_reg(31)));
1376
1377 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1378 inst->predicate = BRW_PREDICATE_NORMAL;
1379 }
1380 break;
1381
1382 case ir_unop_rcp:
1383 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1384 break;
1385
1386 case ir_unop_exp2:
1387 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1388 break;
1389 case ir_unop_log2:
1390 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1391 break;
1392 case ir_unop_exp:
1393 case ir_unop_log:
1394 unreachable("not reached: should be handled by ir_explog_to_explog2");
1395 case ir_unop_sin:
1396 case ir_unop_sin_reduced:
1397 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1398 break;
1399 case ir_unop_cos:
1400 case ir_unop_cos_reduced:
1401 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1402 break;
1403
1404 case ir_unop_dFdx:
1405 case ir_unop_dFdx_coarse:
1406 case ir_unop_dFdx_fine:
1407 case ir_unop_dFdy:
1408 case ir_unop_dFdy_coarse:
1409 case ir_unop_dFdy_fine:
1410 unreachable("derivatives not valid in vertex shader");
1411
1412 case ir_unop_bitfield_reverse:
1413 emit(BFREV(result_dst, op[0]));
1414 break;
1415 case ir_unop_bit_count:
1416 emit(CBIT(result_dst, op[0]));
1417 break;
1418 case ir_unop_find_msb: {
1419 src_reg temp = src_reg(this, glsl_type::uint_type);
1420
1421 inst = emit(FBH(dst_reg(temp), op[0]));
1422 inst->dst.writemask = WRITEMASK_XYZW;
1423
1424 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1425 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1426 * subtract the result from 31 to convert the MSB count into an LSB count.
1427 */
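/* For example (a small sanity check, not from the original comment):
 * findMSB(0x00000800) is 11; FBH counting from the MSB side returns 20, and
 * 31 - 20 = 11.
 */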
1428
1429 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1430 temp.swizzle = BRW_SWIZZLE_NOOP;
1431 emit(MOV(result_dst, temp));
1432
1433 src_reg src_tmp = src_reg(result_dst);
1434 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1435
1436 src_tmp.negate = true;
1437 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1438 inst->predicate = BRW_PREDICATE_NORMAL;
1439 break;
1440 }
1441 case ir_unop_find_lsb:
1442 emit(FBL(result_dst, op[0]));
1443 break;
1444 case ir_unop_saturate:
1445 inst = emit(MOV(result_dst, op[0]));
1446 inst->saturate = true;
1447 break;
1448
1449 case ir_unop_noise:
1450 unreachable("not reached: should be handled by lower_noise");
1451
1452 case ir_binop_add:
1453 emit(ADD(result_dst, op[0], op[1]));
1454 break;
1455 case ir_binop_sub:
1456 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1457
1458 case ir_binop_mul:
1459 if (brw->gen < 8 && ir->type->is_integer()) {
1460 /* For integer multiplication, the MUL uses the low 16 bits of one of
1461 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1462 * adds in the contribution of the upper 16 bits of that
1463 * operand. If we can determine that one of the args is in the low
1464 * 16 bits, though, we can just emit a single MUL.
1465 */
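/* The full 32x32 path below therefore ends up as roughly
 *
 *    mul   acc0  src0  src1     (partial product into the accumulator)
 *    mach  null  src0  src1     (folds in the high-16 contribution)
 *    mov   dst   acc0           (low 32 bits of the product)
 *
 * (illustrative sketch of the emitted sequence, not actual disassembly).
 */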
1466 if (ir->operands[0]->is_uint16_constant()) {
1467 if (brw->gen < 7)
1468 emit(MUL(result_dst, op[0], op[1]));
1469 else
1470 emit(MUL(result_dst, op[1], op[0]));
1471 } else if (ir->operands[1]->is_uint16_constant()) {
1472 if (brw->gen < 7)
1473 emit(MUL(result_dst, op[1], op[0]));
1474 else
1475 emit(MUL(result_dst, op[0], op[1]));
1476 } else {
1477 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1478
1479 emit(MUL(acc, op[0], op[1]));
1480 emit(MACH(dst_null_d(), op[0], op[1]));
1481 emit(MOV(result_dst, src_reg(acc)));
1482 }
1483 } else {
1484 emit(MUL(result_dst, op[0], op[1]));
1485 }
1486 break;
1487 case ir_binop_imul_high: {
1488 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1489
1490 emit(MUL(acc, op[0], op[1]));
1491 emit(MACH(result_dst, op[0], op[1]));
1492 break;
1493 }
1494 case ir_binop_div:
1495 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1496 assert(ir->type->is_integer());
1497 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1498 break;
1499 case ir_binop_carry: {
1500 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1501
1502 emit(ADDC(dst_null_ud(), op[0], op[1]));
1503 emit(MOV(result_dst, src_reg(acc)));
1504 break;
1505 }
1506 case ir_binop_borrow: {
1507 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1508
1509 emit(SUBB(dst_null_ud(), op[0], op[1]));
1510 emit(MOV(result_dst, src_reg(acc)));
1511 break;
1512 }
1513 case ir_binop_mod:
1514 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1515 assert(ir->type->is_integer());
1516 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1517 break;
1518
1519 case ir_binop_less:
1520 case ir_binop_greater:
1521 case ir_binop_lequal:
1522 case ir_binop_gequal:
1523 case ir_binop_equal:
1524 case ir_binop_nequal: {
1525 if (brw->gen <= 5) {
1526 resolve_bool_comparison(ir->operands[0], &op[0]);
1527 resolve_bool_comparison(ir->operands[1], &op[1]);
1528 }
1529 emit(CMP(result_dst, op[0], op[1],
1530 brw_conditional_for_comparison(ir->operation)));
1531 break;
1532 }
1533
1534 case ir_binop_all_equal:
1535 /* "==" operator producing a scalar boolean. */
1536 if (ir->operands[0]->type->is_vector() ||
1537 ir->operands[1]->type->is_vector()) {
1538 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1539 emit(MOV(result_dst, src_reg(0)));
1540 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1541 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1542 } else {
1543 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1544 }
1545 break;
1546 case ir_binop_any_nequal:
1547 /* "!=" operator producing a scalar boolean. */
1548 if (ir->operands[0]->type->is_vector() ||
1549 ir->operands[1]->type->is_vector()) {
1550 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1551
1552 emit(MOV(result_dst, src_reg(0)));
1553 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1554 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1555 } else {
1556 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1557 }
1558 break;
1559
1560 case ir_unop_any:
1561 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1562 emit(MOV(result_dst, src_reg(0)));
1563
1564 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1565 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1566 break;
1567
1568 case ir_binop_logic_xor:
1569 emit(XOR(result_dst, op[0], op[1]));
1570 break;
1571
1572 case ir_binop_logic_or:
1573 emit(OR(result_dst, op[0], op[1]));
1574 break;
1575
1576 case ir_binop_logic_and:
1577 emit(AND(result_dst, op[0], op[1]));
1578 break;
1579
1580 case ir_binop_dot:
1581 assert(ir->operands[0]->type->is_vector());
1582 assert(ir->operands[0]->type == ir->operands[1]->type);
1583 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1584 break;
1585
1586 case ir_unop_sqrt:
1587 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1588 break;
1589 case ir_unop_rsq:
1590 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1591 break;
1592
1593 case ir_unop_bitcast_i2f:
1594 case ir_unop_bitcast_u2f:
1595 this->result = op[0];
1596 this->result.type = BRW_REGISTER_TYPE_F;
1597 break;
1598
1599 case ir_unop_bitcast_f2i:
1600 this->result = op[0];
1601 this->result.type = BRW_REGISTER_TYPE_D;
1602 break;
1603
1604 case ir_unop_bitcast_f2u:
1605 this->result = op[0];
1606 this->result.type = BRW_REGISTER_TYPE_UD;
1607 break;
1608
1609 case ir_unop_i2f:
1610 case ir_unop_i2u:
1611 case ir_unop_u2i:
1612 case ir_unop_u2f:
1613 case ir_unop_f2i:
1614 case ir_unop_f2u:
1615 emit(MOV(result_dst, op[0]));
1616 break;
1617 case ir_unop_b2i:
1618 emit(AND(result_dst, op[0], src_reg(1)));
1619 break;
1620 case ir_unop_b2f:
1621 if (brw->gen <= 5) {
1622 resolve_bool_comparison(ir->operands[0], &op[0]);
1623 }
1624 op[0].type = BRW_REGISTER_TYPE_D;
1625 result_dst.type = BRW_REGISTER_TYPE_D;
1626 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1627 result_dst.type = BRW_REGISTER_TYPE_F;
1628 break;
1629 case ir_unop_f2b:
1630 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1631 break;
1632 case ir_unop_i2b:
1633 emit(AND(result_dst, op[0], src_reg(1)));
1634 break;
1635
1636 case ir_unop_trunc:
1637 emit(RNDZ(result_dst, op[0]));
1638 break;
1639 case ir_unop_ceil: {
1640 src_reg tmp = src_reg(this, ir->type);
1641 op[0].negate = !op[0].negate;
1642 emit(RNDD(dst_reg(tmp), op[0]));
1643 tmp.negate = true;
1644 emit(MOV(result_dst, tmp));
1645 }
1646 break;
1647 case ir_unop_floor:
1648 inst = emit(RNDD(result_dst, op[0]));
1649 break;
1650 case ir_unop_fract:
1651 inst = emit(FRC(result_dst, op[0]));
1652 break;
1653 case ir_unop_round_even:
1654 emit(RNDE(result_dst, op[0]));
1655 break;
1656
1657 case ir_binop_min:
1658 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1659 break;
1660 case ir_binop_max:
1661 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1662 break;
1663
1664 case ir_binop_pow:
1665 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1666 break;
1667
1668 case ir_unop_bit_not:
1669 inst = emit(NOT(result_dst, op[0]));
1670 break;
1671 case ir_binop_bit_and:
1672 inst = emit(AND(result_dst, op[0], op[1]));
1673 break;
1674 case ir_binop_bit_xor:
1675 inst = emit(XOR(result_dst, op[0], op[1]));
1676 break;
1677 case ir_binop_bit_or:
1678 inst = emit(OR(result_dst, op[0], op[1]));
1679 break;
1680
1681 case ir_binop_lshift:
1682 inst = emit(SHL(result_dst, op[0], op[1]));
1683 break;
1684
1685 case ir_binop_rshift:
1686 if (ir->type->base_type == GLSL_TYPE_INT)
1687 inst = emit(ASR(result_dst, op[0], op[1]));
1688 else
1689 inst = emit(SHR(result_dst, op[0], op[1]));
1690 break;
1691
1692 case ir_binop_bfm:
1693 emit(BFI1(result_dst, op[0], op[1]));
1694 break;
1695
1696 case ir_binop_ubo_load: {
1697 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1698 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1699 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1700 src_reg offset;
1701
1702 /* Now, load the vector from that offset. */
1703 assert(ir->type->is_vector() || ir->type->is_scalar());
1704
1705 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1706 packed_consts.type = result.type;
1707 src_reg surf_index;
1708
1709 if (const_uniform_block) {
1710 /* The block index is a constant, so just emit the binding table entry
1711 * as an immediate.
1712 */
1713 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1714 const_uniform_block->value.u[0]);
1715 } else {
1716 /* The block index is not a constant. Evaluate the index expression
1717 * per-channel and add the base UBO index; the generator will select
1718 * a value from any live channel.
1719 */
1720 surf_index = src_reg(this, glsl_type::uint_type);
1721 emit(ADD(dst_reg(surf_index), op[0],
1722 src_reg(prog_data->base.binding_table.ubo_start)));
1723
1724 /* Assume this may touch any UBO. It would be nice to provide
1725 * a tighter bound, but the array information is already lowered away.
1726 */
1727 brw_mark_surface_used(&prog_data->base,
1728 prog_data->base.binding_table.ubo_start +
1729 shader_prog->NumUniformBlocks - 1);
1730 }
1731
1732 if (const_offset_ir) {
1733 if (brw->gen >= 8) {
1734 /* Store the offset in a GRF so we can send-from-GRF. */
1735 offset = src_reg(this, glsl_type::int_type);
1736 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1737 } else {
1738 /* Immediates are fine on older generations since they'll be moved
1739 * to a (potentially fake) MRF at the generator level.
1740 */
1741 offset = src_reg(const_offset / 16);
1742 }
1743 } else {
1744 offset = src_reg(this, glsl_type::uint_type);
1745 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1746 }
1747
1748 if (brw->gen >= 7) {
1749 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1750 grf_offset.type = offset.type;
1751
1752 emit(MOV(grf_offset, offset));
1753
1754 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1755 dst_reg(packed_consts),
1756 surf_index,
1757 src_reg(grf_offset)));
1758 } else {
1759 vec4_instruction *pull =
1760 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1761 dst_reg(packed_consts),
1762 surf_index,
1763 offset));
1764 pull->base_mrf = 14;
1765 pull->mlen = 1;
1766 }
1767
1768 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1769 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1770 const_offset % 16 / 4,
1771 const_offset % 16 / 4,
1772 const_offset % 16 / 4);
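/* E.g. a float at a constant byte offset of 20 reads the second 16-byte
 * slot (offset 20 / 16 = 1) and then replicates component 20 % 16 / 4 = 1,
 * i.e. the Y channel of the fetched vec4.
 */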
1773
1774 /* UBO bools are any nonzero int. We need to convert them to use the
1775 * value of true stored in ctx->Const.UniformBooleanTrue.
1776 */
1777 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1778 emit(CMP(result_dst, packed_consts, src_reg(0u),
1779 BRW_CONDITIONAL_NZ));
1780 } else {
1781 emit(MOV(result_dst, packed_consts));
1782 }
1783 break;
1784 }
1785
1786 case ir_binop_vector_extract:
1787 unreachable("should have been lowered by vec_index_to_cond_assign");
1788
1789 case ir_triop_fma:
1790 op[0] = fix_3src_operand(op[0]);
1791 op[1] = fix_3src_operand(op[1]);
1792 op[2] = fix_3src_operand(op[2]);
1793 /* Note that the instruction's argument order is reversed from GLSL
1794 * and the IR.
1795 */
1796 emit(MAD(result_dst, op[2], op[1], op[0]));
1797 break;
1798
1799 case ir_triop_lrp:
1800 emit_lrp(result_dst, op[0], op[1], op[2]);
1801 break;
1802
1803 case ir_triop_csel:
1804 unreachable("already handled above");
1805 break;
1806
1807 case ir_triop_bfi:
1808 op[0] = fix_3src_operand(op[0]);
1809 op[1] = fix_3src_operand(op[1]);
1810 op[2] = fix_3src_operand(op[2]);
1811 emit(BFI2(result_dst, op[0], op[1], op[2]));
1812 break;
1813
1814 case ir_triop_bitfield_extract:
1815 op[0] = fix_3src_operand(op[0]);
1816 op[1] = fix_3src_operand(op[1]);
1817 op[2] = fix_3src_operand(op[2]);
1818 /* Note that the instruction's argument order is reversed from GLSL
1819 * and the IR.
1820 */
1821 emit(BFE(result_dst, op[2], op[1], op[0]));
1822 break;
1823
1824 case ir_triop_vector_insert:
1825 unreachable("should have been lowered by lower_vector_insert");
1826
1827 case ir_quadop_bitfield_insert:
1828 unreachable("not reached: should be handled by "
1829 "bitfield_insert_to_bfm_bfi\n");
1830
1831 case ir_quadop_vector:
1832 unreachable("not reached: should be handled by lower_quadop_vector");
1833
1834 case ir_unop_pack_half_2x16:
1835 emit_pack_half_2x16(result_dst, op[0]);
1836 break;
1837 case ir_unop_unpack_half_2x16:
1838 emit_unpack_half_2x16(result_dst, op[0]);
1839 break;
1840 case ir_unop_unpack_unorm_4x8:
1841 emit_unpack_unorm_4x8(result_dst, op[0]);
1842 break;
1843 case ir_unop_unpack_snorm_4x8:
1844 emit_unpack_snorm_4x8(result_dst, op[0]);
1845 break;
1846 case ir_unop_pack_unorm_4x8:
1847 emit_pack_unorm_4x8(result_dst, op[0]);
1848 break;
1849 case ir_unop_pack_snorm_4x8:
1850 emit_pack_snorm_4x8(result_dst, op[0]);
1851 break;
1852 case ir_unop_pack_snorm_2x16:
1853 case ir_unop_pack_unorm_2x16:
1854 case ir_unop_unpack_snorm_2x16:
1855 case ir_unop_unpack_unorm_2x16:
1856 unreachable("not reached: should be handled by lower_packing_builtins");
1857 case ir_unop_unpack_half_2x16_split_x:
1858 case ir_unop_unpack_half_2x16_split_y:
1859 case ir_binop_pack_half_2x16_split:
1860 case ir_unop_interpolate_at_centroid:
1861 case ir_binop_interpolate_at_sample:
1862 case ir_binop_interpolate_at_offset:
1863 unreachable("not reached: should not occur in vertex shader");
1864 case ir_binop_ldexp:
1865 unreachable("not reached: should be handled by ldexp_to_arith()");
1866 }
1867 }
1868
1869
1870 void
1871 vec4_visitor::visit(ir_swizzle *ir)
1872 {
1873 src_reg src;
1874 int i = 0;
1875 int swizzle[4];
1876
1877 /* Note that this is only swizzles in expressions, not those on the left
1878 * hand side of an assignment, which do write masking. See ir_assignment
1879 * for that.
1880 */
1881
1882 ir->val->accept(this);
1883 src = this->result;
1884 assert(src.file != BAD_FILE);
1885
1886 for (i = 0; i < ir->type->vector_elements; i++) {
1887 switch (i) {
1888 case 0:
1889 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1890 break;
1891 case 1:
1892 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1893 break;
1894 case 2:
1895 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1896 break;
1897 case 3:
1898 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1899 break;
1900 }
1901 }
1902 for (; i < 4; i++) {
1903 /* Replicate the last channel out. */
1904 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1905 }
1906
1907 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1908
1909 this->result = src;
1910 }
1911
1912 void
1913 vec4_visitor::visit(ir_dereference_variable *ir)
1914 {
1915 const struct glsl_type *type = ir->type;
1916 dst_reg *reg = variable_storage(ir->var);
1917
1918 if (!reg) {
1919 fail("Failed to find variable storage for %s\n", ir->var->name);
1920 this->result = src_reg(brw_null_reg());
1921 return;
1922 }
1923
1924 this->result = src_reg(*reg);
1925
1926 /* System values get their swizzle from the dst_reg writemask */
1927 if (ir->var->data.mode == ir_var_system_value)
1928 return;
1929
1930 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1931 this->result.swizzle = swizzle_for_size(type->vector_elements);
1932 }
1933
1934
1935 int
1936 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1937 {
1938 /* Under normal circumstances array elements are stored consecutively, so
1939 * the stride is equal to the size of the array element.
1940 */
1941 return type_size(ir->type);
1942 }
1943
1944
1945 void
1946 vec4_visitor::visit(ir_dereference_array *ir)
1947 {
1948 ir_constant *constant_index;
1949 src_reg src;
1950 int array_stride = compute_array_stride(ir);
1951
1952 constant_index = ir->array_index->constant_expression_value();
1953
1954 ir->array->accept(this);
1955 src = this->result;
1956
1957 if (constant_index) {
1958 src.reg_offset += constant_index->value.i[0] * array_stride;
1959 } else {
1960 /* Variable index array dereference. It eats the "vec4" of the
1961 * base of the array and an index that offsets the Mesa register
1962 * index.
1963 */
1964 ir->array_index->accept(this);
1965
1966 src_reg index_reg;
1967
1968 if (array_stride == 1) {
1969 index_reg = this->result;
1970 } else {
1971 index_reg = src_reg(this, glsl_type::int_type);
1972
1973 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1974 }
1975
1976 if (src.reladdr) {
1977 src_reg temp = src_reg(this, glsl_type::int_type);
1978
1979 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1980
1981 index_reg = temp;
1982 }
1983
1984 src.reladdr = ralloc(mem_ctx, src_reg);
1985 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1986 }
1987
1988 /* If the type is smaller than a vec4, replicate the last channel out. */
1989 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1990 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1991 else
1992 src.swizzle = BRW_SWIZZLE_NOOP;
1993 src.type = brw_type_for_base_type(ir->type);
1994
1995 this->result = src;
1996 }
1997
1998 void
1999 vec4_visitor::visit(ir_dereference_record *ir)
2000 {
2001 unsigned int i;
2002 const glsl_type *struct_type = ir->record->type;
2003 int offset = 0;
2004
2005 ir->record->accept(this);
2006
2007 for (i = 0; i < struct_type->length; i++) {
2008 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2009 break;
2010 offset += type_size(struct_type->fields.structure[i].type);
2011 }
2012
2013 /* If the type is smaller than a vec4, replicate the last channel out. */
2014 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2015 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2016 else
2017 this->result.swizzle = BRW_SWIZZLE_NOOP;
2018 this->result.type = brw_type_for_base_type(ir->type);
2019
2020 this->result.reg_offset += offset;
2021 }
2022
2023 /**
2024 * We want to be careful in assignment setup to hit the actual storage
2025 * instead of potentially using a temporary like we might with the
2026 * ir_dereference handler.
2027 */
2028 static dst_reg
2029 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2030 {
2031 /* The LHS must be a dereference. If the LHS is a variable indexed array
2032 * access of a vector, it must be separated into a series of conditional moves
2033 * before reaching this point (see ir_vec_index_to_cond_assign).
2034 */
2035 assert(ir->as_dereference());
2036 ir_dereference_array *deref_array = ir->as_dereference_array();
2037 if (deref_array) {
2038 assert(!deref_array->array->type->is_vector());
2039 }
2040
2041 /* Use the rvalue deref handler for the most part. We'll ignore
2042 * swizzles in it and write swizzles using writemask, though.
2043 */
2044 ir->accept(v);
2045 return dst_reg(v->result);
2046 }
2047
2048 void
2049 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2050 const struct glsl_type *type,
2051 enum brw_predicate predicate)
2052 {
2053 if (type->base_type == GLSL_TYPE_STRUCT) {
2054 for (unsigned int i = 0; i < type->length; i++) {
2055 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2056 }
2057 return;
2058 }
2059
2060 if (type->is_array()) {
2061 for (unsigned int i = 0; i < type->length; i++) {
2062 emit_block_move(dst, src, type->fields.array, predicate);
2063 }
2064 return;
2065 }
2066
2067 if (type->is_matrix()) {
2068 const struct glsl_type *vec_type;
2069
2070 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2071 type->vector_elements, 1);
2072
2073 for (int i = 0; i < type->matrix_columns; i++) {
2074 emit_block_move(dst, src, vec_type, predicate);
2075 }
2076 return;
2077 }
2078
2079 assert(type->is_scalar() || type->is_vector());
2080
2081 dst->type = brw_type_for_base_type(type);
2082 src->type = dst->type;
2083
2084 dst->writemask = (1 << type->vector_elements) - 1;
2085
2086 src->swizzle = swizzle_for_size(type->vector_elements);
2087
2088 vec4_instruction *inst = emit(MOV(*dst, *src));
2089 inst->predicate = predicate;
2090
2091 dst->reg_offset++;
2092 src->reg_offset++;
2093 }
2094
2095
2096 /* If the RHS processing resulted in an instruction generating a
2097 * temporary value, and it would be easy to rewrite the instruction to
2098 * generate its result right into the LHS instead, do so. This ends
2099 * up reliably removing instructions where it can be tricky to do so
2100 * later without real UD chain information.
2101 */
2102 bool
2103 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2104 dst_reg dst,
2105 src_reg src,
2106 vec4_instruction *pre_rhs_inst,
2107 vec4_instruction *last_rhs_inst)
2108 {
2109 /* This could be supported, but it would take more smarts. */
2110 if (ir->condition)
2111 return false;
2112
2113 if (pre_rhs_inst == last_rhs_inst)
2114 return false; /* No instructions generated to work with. */
2115
2116 /* Make sure the last instruction generated our source reg. */
2117 if (src.file != GRF ||
2118 src.file != last_rhs_inst->dst.file ||
2119 src.reg != last_rhs_inst->dst.reg ||
2120 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2121 src.reladdr ||
2122 src.abs ||
2123 src.negate ||
2124 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2125 return false;
2126
2127 /* Check that the last instruction fully initialized the channels
2128 * we want to use, in the order we want to use them. We could
2129 * potentially reswizzle the operands of many instructions so that
2130 * we could handle out of order channels, but don't yet.
2131 */
2132
2133 for (unsigned i = 0; i < 4; i++) {
2134 if (dst.writemask & (1 << i)) {
2135 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2136 return false;
2137
2138 if (BRW_GET_SWZ(src.swizzle, i) != i)
2139 return false;
2140 }
2141 }
2142
2143 /* Success! Rewrite the instruction. */
2144 last_rhs_inst->dst.file = dst.file;
2145 last_rhs_inst->dst.reg = dst.reg;
2146 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2147 last_rhs_inst->dst.reladdr = dst.reladdr;
2148 last_rhs_inst->dst.writemask &= dst.writemask;
2149
2150 return true;
2151 }
2152
2153 void
2154 vec4_visitor::visit(ir_assignment *ir)
2155 {
2156 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2157 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2158
2159 if (!ir->lhs->type->is_scalar() &&
2160 !ir->lhs->type->is_vector()) {
2161 ir->rhs->accept(this);
2162 src_reg src = this->result;
2163
2164 if (ir->condition) {
2165 emit_bool_to_cond_code(ir->condition, &predicate);
2166 }
2167
2168 /* emit_block_move doesn't account for swizzles in the source register.
2169 * This should be ok, since the source register is a structure or an
2170 * array, and those can't be swizzled. But double-check to be sure.
2171 */
2172 assert(src.swizzle ==
2173 (ir->rhs->type->is_matrix()
2174 ? swizzle_for_size(ir->rhs->type->vector_elements)
2175 : BRW_SWIZZLE_NOOP));
2176
2177 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2178 return;
2179 }
2180
2181 /* Now we're down to just a scalar/vector with writemasks. */
2182 int i;
2183
2184 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2185 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2186
2187 ir->rhs->accept(this);
2188
2189 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2190
2191 src_reg src = this->result;
2192
2193 int swizzles[4];
2194 int first_enabled_chan = 0;
2195 int src_chan = 0;
2196
2197 assert(ir->lhs->type->is_vector() ||
2198 ir->lhs->type->is_scalar());
2199 dst.writemask = ir->write_mask;
2200
2201 for (int i = 0; i < 4; i++) {
2202 if (dst.writemask & (1 << i)) {
2203 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2204 break;
2205 }
2206 }
2207
2208 /* Swizzle a small RHS vector into the channels being written.
2209 *
2210 * glsl ir treats write_mask as dictating how many channels are
2211 * present on the RHS while in our instructions we need to make
2212 * those channels appear in the slots of the vec4 they're written to.
2213 */
2214 for (int i = 0; i < 4; i++) {
2215 if (dst.writemask & (1 << i))
2216 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2217 else
2218 swizzles[i] = first_enabled_chan;
2219 }
2220 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2221 swizzles[2], swizzles[3]);
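/* For example, with writemask .zw and an RHS swizzle of .xyyy, the loop
 * above produces .yyxy: the two RHS components land in the Z and W slots,
 * and the unwritten slots just repeat a valid source channel.
 */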
2222
2223 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2224 return;
2225 }
2226
2227 if (ir->condition) {
2228 emit_bool_to_cond_code(ir->condition, &predicate);
2229 }
2230
2231 for (i = 0; i < type_size(ir->lhs->type); i++) {
2232 vec4_instruction *inst = emit(MOV(dst, src));
2233 inst->predicate = predicate;
2234
2235 dst.reg_offset++;
2236 src.reg_offset++;
2237 }
2238 }
2239
2240 void
2241 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2242 {
2243 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2244 foreach_in_list(ir_constant, field_value, &ir->components) {
2245 emit_constant_values(dst, field_value);
2246 }
2247 return;
2248 }
2249
2250 if (ir->type->is_array()) {
2251 for (unsigned int i = 0; i < ir->type->length; i++) {
2252 emit_constant_values(dst, ir->array_elements[i]);
2253 }
2254 return;
2255 }
2256
2257 if (ir->type->is_matrix()) {
2258 for (int i = 0; i < ir->type->matrix_columns; i++) {
2259 float *vec = &ir->value.f[i * ir->type->vector_elements];
2260
2261 for (int j = 0; j < ir->type->vector_elements; j++) {
2262 dst->writemask = 1 << j;
2263 dst->type = BRW_REGISTER_TYPE_F;
2264
2265 emit(MOV(*dst, src_reg(vec[j])));
2266 }
2267 dst->reg_offset++;
2268 }
2269 return;
2270 }
2271
2272 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2273
2274 for (int i = 0; i < ir->type->vector_elements; i++) {
2275 if (!(remaining_writemask & (1 << i)))
2276 continue;
2277
2278 dst->writemask = 1 << i;
2279 dst->type = brw_type_for_base_type(ir->type);
2280
2281 /* Find other components that match the one we're about to
2282 * write. Emits fewer instructions for things like vec4(0.5,
2283 * 1.5, 1.5, 1.5).
2284 */
2285 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2286 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2287 if (ir->value.b[i] == ir->value.b[j])
2288 dst->writemask |= (1 << j);
2289 } else {
2290 /* u, i, and f storage all line up, so no need for a
2291 * switch case for comparing each type.
2292 */
2293 if (ir->value.u[i] == ir->value.u[j])
2294 dst->writemask |= (1 << j);
2295 }
2296 }
2297
2298 switch (ir->type->base_type) {
2299 case GLSL_TYPE_FLOAT:
2300 emit(MOV(*dst, src_reg(ir->value.f[i])));
2301 break;
2302 case GLSL_TYPE_INT:
2303 emit(MOV(*dst, src_reg(ir->value.i[i])));
2304 break;
2305 case GLSL_TYPE_UINT:
2306 emit(MOV(*dst, src_reg(ir->value.u[i])));
2307 break;
2308 case GLSL_TYPE_BOOL:
2309 emit(MOV(*dst,
2310 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2311 : 0)));
2312 break;
2313 default:
2314 unreachable("Non-float/uint/int/bool constant");
2315 }
2316
2317 remaining_writemask &= ~dst->writemask;
2318 }
2319 dst->reg_offset++;
2320 }
2321
2322 void
2323 vec4_visitor::visit(ir_constant *ir)
2324 {
2325 dst_reg dst = dst_reg(this, ir->type);
2326 this->result = src_reg(dst);
2327
2328 emit_constant_values(&dst, ir);
2329 }
2330
2331 void
2332 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2333 {
2334 ir_dereference *deref = static_cast<ir_dereference *>(
2335 ir->actual_parameters.get_head());
2336 ir_variable *location = deref->variable_referenced();
2337 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2338 location->data.binding);
2339
2340 /* Calculate the surface offset */
2341 src_reg offset(this, glsl_type::uint_type);
2342 ir_dereference_array *deref_array = deref->as_dereference_array();
2343 if (deref_array) {
2344 deref_array->array_index->accept(this);
2345
2346 src_reg tmp(this, glsl_type::uint_type);
2347 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2348 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2349 } else {
2350 offset = location->data.atomic.offset;
2351 }
2352
2353 /* Emit the appropriate machine instruction */
2354 const char *callee = ir->callee->function_name();
2355 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2356
2357 if (!strcmp("__intrinsic_atomic_read", callee)) {
2358 emit_untyped_surface_read(surf_index, dst, offset);
2359
2360 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2361 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2362 src_reg(), src_reg());
2363
2364 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2365 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2366 src_reg(), src_reg());
2367 }
2368 }
2369
2370 void
2371 vec4_visitor::visit(ir_call *ir)
2372 {
2373 const char *callee = ir->callee->function_name();
2374
2375 if (!strcmp("__intrinsic_atomic_read", callee) ||
2376 !strcmp("__intrinsic_atomic_increment", callee) ||
2377 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2378 visit_atomic_counter_intrinsic(ir);
2379 } else {
2380 unreachable("Unsupported intrinsic.");
2381 }
2382 }
2383
2384 src_reg
2385 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2386 {
2387 vec4_instruction *inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS);
2388 inst->base_mrf = 2;
2389 inst->mlen = 1;
2390 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2391 inst->dst.writemask = WRITEMASK_XYZW;
2392
2393 inst->src[1] = sampler;
2394
2395 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2396 int param_base = inst->base_mrf;
2397 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2398 int zero_mask = 0xf & ~coord_mask;
2399
2400 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2401 coordinate));
2402
2403 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2404 src_reg(0)));
2405
2406 emit(inst);
2407 return src_reg(inst->dst);
2408 }
2409
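/* A sampler index that doesn't fit in the 4-bit sampler field of the message
 * descriptor has to be supplied through the message header instead.  That
 * only comes up on Haswell and Gen8+, and a non-immediate index has to be
 * treated as potentially >= 16.
 */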
2410 static bool
2411 is_high_sampler(struct brw_context *brw, src_reg sampler)
2412 {
2413 if (brw->gen < 8 && !brw->is_haswell)
2414 return false;
2415
2416 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2417 }
2418
2419 void
2420 vec4_visitor::visit(ir_texture *ir)
2421 {
2422 uint32_t sampler =
2423 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2424
2425 ir_rvalue *nonconst_sampler_index =
2426 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2427
2428 /* Handle non-constant sampler array indexing */
2429 src_reg sampler_reg;
2430 if (nonconst_sampler_index) {
2431 /* The highest sampler which may be used by this operation is
2432 * the last element of the array. Mark it here, because the generator
2433 * doesn't have enough information to determine the bound.
2434 */
2435 uint32_t array_size = ir->sampler->as_dereference_array()
2436 ->array->type->array_size();
2437
2438 uint32_t max_used = sampler + array_size - 1;
2439 if (ir->op == ir_tg4 && brw->gen < 8) {
2440 max_used += prog_data->base.binding_table.gather_texture_start;
2441 } else {
2442 max_used += prog_data->base.binding_table.texture_start;
2443 }
2444
2445 brw_mark_surface_used(&prog_data->base, max_used);
2446
2447 /* Emit code to evaluate the actual indexing expression */
2448 nonconst_sampler_index->accept(this);
2449 dst_reg temp(this, glsl_type::uint_type);
2450 emit(ADD(temp, this->result, src_reg(sampler)))
2451 ->force_writemask_all = true;
2452 sampler_reg = src_reg(temp);
2453 } else {
2454 /* Single sampler, or constant array index; the indexing expression
2455 * is just an immediate.
2456 */
2457 sampler_reg = src_reg(sampler);
2458 }
2459
2460 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2461 * emitting anything other than setting up the constant result.
2462 */
2463 if (ir->op == ir_tg4) {
2464 ir_constant *chan = ir->lod_info.component->as_constant();
2465 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2466 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2467 dst_reg result(this, ir->type);
2468 this->result = src_reg(result);
2469 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2470 return;
2471 }
2472 }
2473
2474 /* Should be lowered by do_lower_texture_projection */
2475 assert(!ir->projector);
2476
2477 /* Should be lowered */
2478 assert(!ir->offset || !ir->offset->type->is_array());
2479
2480 /* Generate code to compute all the subexpression trees. This has to be
2481 * done before loading any values into MRFs for the sampler message since
2482 * generating these values may involve SEND messages that need the MRFs.
2483 */
2484 src_reg coordinate;
2485 if (ir->coordinate) {
2486 ir->coordinate->accept(this);
2487 coordinate = this->result;
2488 }
2489
2490 src_reg shadow_comparitor;
2491 if (ir->shadow_comparitor) {
2492 ir->shadow_comparitor->accept(this);
2493 shadow_comparitor = this->result;
2494 }
2495
2496 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2497 src_reg offset_value;
2498 if (has_nonconstant_offset) {
2499 ir->offset->accept(this);
2500 offset_value = src_reg(this->result);
2501 }
2502
2503 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2504 src_reg lod, dPdx, dPdy, sample_index, mcs;
2505 switch (ir->op) {
2506 case ir_tex:
2507 lod = src_reg(0.0f);
2508 lod_type = glsl_type::float_type;
2509 break;
2510 case ir_txf:
2511 case ir_txl:
2512 case ir_txs:
2513 ir->lod_info.lod->accept(this);
2514 lod = this->result;
2515 lod_type = ir->lod_info.lod->type;
2516 break;
2517 case ir_query_levels:
2518 lod = src_reg(0);
2519 lod_type = glsl_type::int_type;
2520 break;
2521 case ir_txf_ms:
2522 ir->lod_info.sample_index->accept(this);
2523 sample_index = this->result;
2524 sample_index_type = ir->lod_info.sample_index->type;
2525
2526 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2527 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2528 else
2529 mcs = src_reg(0u);
2530 break;
2531 case ir_txd:
2532 ir->lod_info.grad.dPdx->accept(this);
2533 dPdx = this->result;
2534
2535 ir->lod_info.grad.dPdy->accept(this);
2536 dPdy = this->result;
2537
2538 lod_type = ir->lod_info.grad.dPdx->type;
2539 break;
2540 case ir_txb:
2541 case ir_lod:
2542 case ir_tg4:
2543 break;
2544 }
2545
2546 enum opcode opcode;
2547 switch (ir->op) {
2548 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2549 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2550 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2551 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2552 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2553 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2554 case ir_tg4: opcode = has_nonconstant_offset
2555 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2556 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2557 case ir_txb:
2558 unreachable("TXB is not valid for vertex shaders.");
2559 case ir_lod:
2560 unreachable("LOD is not valid for vertex shaders.");
2561 default:
2562 unreachable("Unrecognized tex op");
2563 }
2564
2565 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode);
2566
2567 if (ir->offset != NULL && !has_nonconstant_offset) {
2568 inst->offset =
2569 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2570 ir->offset->type->vector_elements);
2571 }
2572
2573 /* Stuff the channel select bits in the top of the texture offset */
2574 if (ir->op == ir_tg4)
2575 inst->offset |= gather_channel(ir, sampler) << 16;
2576
2577 /* The message header is necessary for:
2578 * - Gen4 (always)
2579 * - Gen9+ for selecting SIMD4x2
2580 * - Texel offsets
2581 * - Gather channel selection
2582 * - Sampler indices too large to fit in a 4-bit value.
2583 */
2584 inst->header_present =
2585 brw->gen < 5 || brw->gen >= 9 ||
2586 inst->offset != 0 || ir->op == ir_tg4 ||
2587 is_high_sampler(brw, sampler_reg);
2588 inst->base_mrf = 2;
2589 inst->mlen = inst->header_present + 1; /* always at least one */
2590 inst->dst = dst_reg(this, ir->type);
2591 inst->dst.writemask = WRITEMASK_XYZW;
2592 inst->shadow_compare = ir->shadow_comparitor != NULL;
2593
2594 inst->src[1] = sampler_reg;
2595
2596 /* MRF for the first parameter */
2597 int param_base = inst->base_mrf + inst->header_present;
2598
2599 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2600 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2601 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2602 } else {
2603 /* Load the coordinate */
2604 /* FINISHME: gl_clamp_mask and saturate */
2605 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2606 int zero_mask = 0xf & ~coord_mask;
2607
2608 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2609 coordinate));
2610
2611 if (zero_mask != 0) {
2612 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2613 src_reg(0)));
2614 }
2615 /* Load the shadow comparitor */
2616 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2617 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2618 WRITEMASK_X),
2619 shadow_comparitor));
2620 inst->mlen++;
2621 }
2622
2623 /* Load the LOD info */
2624 if (ir->op == ir_tex || ir->op == ir_txl) {
2625 int mrf, writemask;
2626 if (brw->gen >= 5) {
2627 mrf = param_base + 1;
2628 if (ir->shadow_comparitor) {
2629 writemask = WRITEMASK_Y;
2630 /* mlen already incremented */
2631 } else {
2632 writemask = WRITEMASK_X;
2633 inst->mlen++;
2634 }
2635 } else /* brw->gen == 4 */ {
2636 mrf = param_base;
2637 writemask = WRITEMASK_W;
2638 }
2639 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2640 } else if (ir->op == ir_txf) {
2641 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2642 } else if (ir->op == ir_txf_ms) {
2643 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2644 sample_index));
2645 if (brw->gen >= 7) {
2646 /* MCS data is in the first channel of `mcs`, but we need to get it into
2647 * the .y channel of the second vec4 of params, so replicate .x across
2648 * the whole vec4 and then mask off everything except .y
2649 */
2650 mcs.swizzle = BRW_SWIZZLE_XXXX;
2651 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2652 mcs));
2653 }
2654 inst->mlen++;
2655 } else if (ir->op == ir_txd) {
2656 const glsl_type *type = lod_type;
2657
2658 if (brw->gen >= 5) {
2659 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2660 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2661 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2662 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2663 inst->mlen++;
2664
2665 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2666 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2667 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2668 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2669 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2670 inst->mlen++;
2671
2672 if (ir->shadow_comparitor) {
2673 emit(MOV(dst_reg(MRF, param_base + 2,
2674 ir->shadow_comparitor->type, WRITEMASK_Z),
2675 shadow_comparitor));
2676 }
2677 }
2678 } else /* brw->gen == 4 */ {
2679 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2680 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2681 inst->mlen += 2;
2682 }
2683 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2684 if (ir->shadow_comparitor) {
2685 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2686 shadow_comparitor));
2687 }
2688
2689 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2690 offset_value));
2691 inst->mlen++;
2692 }
2693 }
2694
2695 emit(inst);
2696
2697 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2698 * spec requires layers.
2699 */
2700 if (ir->op == ir_txs) {
2701 glsl_type const *type = ir->sampler->type;
2702 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2703 type->sampler_array) {
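/* A cube has 6 faces, so dividing the returned depth (.z) by 6 yields the
 * layer count.
 */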
2704 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2705 writemask(inst->dst, WRITEMASK_Z),
2706 src_reg(inst->dst), src_reg(6));
2707 }
2708 }
2709
2710 if (brw->gen == 6 && ir->op == ir_tg4) {
2711 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2712 }
2713
2714 swizzle_result(ir, src_reg(inst->dst), sampler);
2715 }
2716
2717 /**
2718 * Apply workarounds for Gen6 gather with UINT/SINT
2719 */
2720 void
2721 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2722 {
2723 if (!wa)
2724 return;
2725
2726 int width = (wa & WA_8BIT) ? 8 : 16;
2727 dst_reg dst_f = dst;
2728 dst_f.type = BRW_REGISTER_TYPE_F;
2729
2730 /* Convert from UNORM to UINT */
2731 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2732 emit(MOV(dst, src_reg(dst_f)));
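/* i.e. scale the [0, 1] UNORM value by (2^width - 1) and truncate back to
 * an integer, recovering the raw 8- or 16-bit value.
 */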
2733
2734 if (wa & WA_SIGN) {
2735 /* Reinterpret the UINT value as a signed INT value by
2736 * shifting the sign bit into place, then shifting back
2737 * preserving sign.
2738 */
2739 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2740 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2741 }
2742 }
2743
2744 /**
2745 * Set up the gather channel based on the swizzle, for gather4.
2746 */
2747 uint32_t
2748 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2749 {
2750 ir_constant *chan = ir->lod_info.component->as_constant();
2751 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2752 switch (swiz) {
2753 case SWIZZLE_X: return 0;
2754 case SWIZZLE_Y:
2755 /* gather4 sampler is broken for green channel on RG32F --
2756 * we must ask for blue instead.
2757 */
2758 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2759 return 2;
2760 return 1;
2761 case SWIZZLE_Z: return 2;
2762 case SWIZZLE_W: return 3;
2763 default:
2764 unreachable("Not reached"); /* zero, one swizzles handled already */
2765 }
2766 }
2767
2768 void
2769 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2770 {
2771 int s = key->tex.swizzles[sampler];
2772
2773 this->result = src_reg(this, ir->type);
2774 dst_reg swizzled_result(this->result);
2775
2776 if (ir->op == ir_query_levels) {
2777 /* # levels is in .w */
2778 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2779 emit(MOV(swizzled_result, orig_val));
2780 return;
2781 }
2782
2783 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2784 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2785 emit(MOV(swizzled_result, orig_val));
2786 return;
2787 }
2788
2789
2790 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2791 int swizzle[4] = {0};
2792
2793 for (int i = 0; i < 4; i++) {
2794 switch (GET_SWZ(s, i)) {
2795 case SWIZZLE_ZERO:
2796 zero_mask |= (1 << i);
2797 break;
2798 case SWIZZLE_ONE:
2799 one_mask |= (1 << i);
2800 break;
2801 default:
2802 copy_mask |= (1 << i);
2803 swizzle[i] = GET_SWZ(s, i);
2804 break;
2805 }
2806 }
2807
2808 if (copy_mask) {
2809 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2810 swizzled_result.writemask = copy_mask;
2811 emit(MOV(swizzled_result, orig_val));
2812 }
2813
2814 if (zero_mask) {
2815 swizzled_result.writemask = zero_mask;
2816 emit(MOV(swizzled_result, src_reg(0.0f)));
2817 }
2818
2819 if (one_mask) {
2820 swizzled_result.writemask = one_mask;
2821 emit(MOV(swizzled_result, src_reg(1.0f)));
2822 }
2823 }
2824
2825 void
2826 vec4_visitor::visit(ir_return *)
2827 {
2828 unreachable("not reached");
2829 }
2830
2831 void
2832 vec4_visitor::visit(ir_discard *)
2833 {
2834 unreachable("not reached");
2835 }
2836
2837 void
2838 vec4_visitor::visit(ir_if *ir)
2839 {
2840 /* Don't point the annotation at the if statement, because then it plus
2841 * the then and else blocks get printed.
2842 */
2843 this->base_ir = ir->condition;
2844
2845 if (brw->gen == 6) {
2846 emit_if_gen6(ir);
2847 } else {
2848 enum brw_predicate predicate;
2849 emit_bool_to_cond_code(ir->condition, &predicate);
2850 emit(IF(predicate));
2851 }
2852
2853 visit_instructions(&ir->then_instructions);
2854
2855 if (!ir->else_instructions.is_empty()) {
2856 this->base_ir = ir->condition;
2857 emit(BRW_OPCODE_ELSE);
2858
2859 visit_instructions(&ir->else_instructions);
2860 }
2861
2862 this->base_ir = ir->condition;
2863 emit(BRW_OPCODE_ENDIF);
2864 }
2865
2866 void
2867 vec4_visitor::visit(ir_emit_vertex *)
2868 {
2869 unreachable("not reached");
2870 }
2871
2872 void
2873 vec4_visitor::visit(ir_end_primitive *)
2874 {
2875 unreachable("not reached");
2876 }
2877
2878 void
2879 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2880 dst_reg dst, src_reg offset,
2881 src_reg src0, src_reg src1)
2882 {
2883 unsigned mlen = 0;
2884
2885 /* Set the atomic operation offset. */
2886 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2887 mlen++;
2888
2889 /* Set the atomic operation arguments. */
2890 if (src0.file != BAD_FILE) {
2891 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2892 mlen++;
2893 }
2894
2895 if (src1.file != BAD_FILE) {
2896 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2897 mlen++;
2898 }
2899
2900 /* Emit the instruction. Note that this maps to the normal SIMD8
2901 * untyped atomic message on Ivy Bridge, but that's OK because
2902 * unused channels will be masked out.
2903 */
2904 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2905 src_reg(atomic_op), src_reg(surf_index));
2906 inst->base_mrf = 0;
2907 inst->mlen = mlen;
2908 }
2909
2910 void
2911 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2912 src_reg offset)
2913 {
2914 /* Set the surface read offset. */
2915 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2916
2917 /* Emit the instruction. Note that this maps to the normal SIMD8
2918 * untyped surface read message, but that's OK because unused
2919 * channels will be masked out.
2920 */
2921 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2922 dst, src_reg(surf_index));
2923 inst->base_mrf = 0;
2924 inst->mlen = 1;
2925 }
2926
2927 void
2928 vec4_visitor::emit_ndc_computation()
2929 {
2930 /* Get the position */
2931 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2932
2933 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2934 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2935 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2936
2937 current_annotation = "NDC";
2938 dst_reg ndc_w = ndc;
2939 ndc_w.writemask = WRITEMASK_W;
2940 src_reg pos_w = pos;
2941 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2942 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2943
2944 dst_reg ndc_xyz = ndc;
2945 ndc_xyz.writemask = WRITEMASK_XYZ;
2946
2947 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2948 }
2949
2950 void
2951 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2952 {
2953 if (brw->gen < 6 &&
2954 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2955 key->userclip_active || brw->has_negative_rhw_bug)) {
2956 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2957 dst_reg header1_w = header1;
2958 header1_w.writemask = WRITEMASK_W;
2959
2960 emit(MOV(header1, 0u));
2961
2962 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2963 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2964
2965 current_annotation = "Point size";
2966 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2967 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2968 }
2969
2970 if (key->userclip_active) {
2971 current_annotation = "Clipping flags";
2972 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2973 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2974
2975 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2976 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2977 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2978
2979 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2980 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2981 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2982 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2983 }
2984
2985 /* i965 clipping workaround:
2986 * 1) Test for -ve rhw
2987 * 2) If set,
2988 * set ndc = (0,0,0,0)
2989 * set ucp[6] = 1
2990 *
2991 * Later, clipping will detect ucp[6] and ensure the primitive is
2992 * clipped against all fixed planes.
2993 */
2994 if (brw->has_negative_rhw_bug) {
2995 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2996 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2997 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2998 vec4_instruction *inst;
2999 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3000 inst->predicate = BRW_PREDICATE_NORMAL;
3001 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3002 inst->predicate = BRW_PREDICATE_NORMAL;
3003 }
3004
3005 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3006 } else if (brw->gen < 6) {
3007 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3008 } else {
3009 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3010 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3011 dst_reg reg_w = reg;
3012 reg_w.writemask = WRITEMASK_W;
3013 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3014 }
3015 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3016 dst_reg reg_y = reg;
3017 reg_y.writemask = WRITEMASK_Y;
3018 reg_y.type = BRW_REGISTER_TYPE_D;
3019 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3020 }
3021 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3022 dst_reg reg_z = reg;
3023 reg_z.writemask = WRITEMASK_Z;
3024 reg_z.type = BRW_REGISTER_TYPE_D;
3025 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3026 }
3027 }
3028 }
3029
3030 void
3031 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3032 {
3033 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3034 *
3035 * "If a linked set of shaders forming the vertex stage contains no
3036 * static write to gl_ClipVertex or gl_ClipDistance, but the
3037 * application has requested clipping against user clip planes through
3038 * the API, then the coordinate written to gl_Position is used for
3039 * comparison against the user clip planes."
3040 *
3041 * This function is only called if the shader didn't write to
3042 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3043 * if the user wrote to it; otherwise we use gl_Position.
3044 */
3045 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3046 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3047 clip_vertex = VARYING_SLOT_POS;
3048 }
3049
3050 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3051 ++i) {
3052 reg.writemask = 1 << i;
3053 emit(DP4(reg,
3054 src_reg(output_reg[clip_vertex]),
3055 src_reg(this->userplane[i + offset])));
3056 }
3057 }
3058
3059 vec4_instruction *
3060 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3061 {
3062 assert (varying < VARYING_SLOT_MAX);
3063 reg.type = output_reg[varying].type;
3064 current_annotation = output_reg_annotation[varying];
3065 /* Copy the register, saturating if necessary */
3066 return emit(MOV(reg, src_reg(output_reg[varying])));
3067 }
3068
3069 void
3070 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3071 {
3072 reg.type = BRW_REGISTER_TYPE_F;
3073
3074 switch (varying) {
3075 case VARYING_SLOT_PSIZ:
3076 {
3077 /* PSIZ is always in slot 0, and is coupled with other flags. */
3078 current_annotation = "indices, point width, clip flags";
3079 emit_psiz_and_flags(reg);
3080 break;
3081 }
3082 case BRW_VARYING_SLOT_NDC:
3083 current_annotation = "NDC";
3084 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3085 break;
3086 case VARYING_SLOT_POS:
3087 current_annotation = "gl_Position";
3088 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3089 break;
3090 case VARYING_SLOT_EDGE:
3091 /* This is present when doing unfilled polygons. We're supposed to copy
3092 * the edge flag from the user-provided vertex array
3093 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3094 * of that attribute (starts as 1.0f). This is then used in clipping to
3095 * determine which edges should be drawn as wireframe.
3096 */
3097 current_annotation = "edge flag";
3098 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3099 glsl_type::float_type, WRITEMASK_XYZW))));
3100 break;
3101 case BRW_VARYING_SLOT_PAD:
3102 /* No need to write to this slot */
3103 break;
3104 case VARYING_SLOT_COL0:
3105 case VARYING_SLOT_COL1:
3106 case VARYING_SLOT_BFC0:
3107 case VARYING_SLOT_BFC1: {
3108 /* These built-in varyings are only supported in compatibility mode,
3109 * and we only support GS in core profile. So, this must be a vertex
3110 * shader.
3111 */
3112 assert(stage == MESA_SHADER_VERTEX);
3113 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3114 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3115 inst->saturate = true;
3116 break;
3117 }
3118
3119 default:
3120 emit_generic_urb_slot(reg, varying);
3121 break;
3122 }
3123 }
3124
3125 static int
3126 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3127 {
3128 if (brw->gen >= 6) {
3129 /* URB data written (does not include the message header reg) must
3130 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3131 * section 5.4.3.2.2: URB_INTERLEAVED.
3132 *
3133 * URB entries are allocated on a multiple of 1024 bits, so an
3134 * extra 128 bits written here to make the end align to 256 is
3135 * no problem.
3136 */
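/* mlen counts the message header as well, so the URB data length is
 * mlen - 1; rounding mlen up to the next odd value keeps that data
 * length an even number of registers.
 */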
3137 if ((mlen % 2) != 1)
3138 mlen++;
3139 }
3140
3141 return mlen;
3142 }
3143
3144
3145 /**
3146 * Generates the VUE payload plus the necessary URB write instructions to
3147 * output it.
3148 *
3149 * The VUE layout is documented in Volume 2a.
3150 */
3151 void
3152 vec4_visitor::emit_vertex()
3153 {
3154 /* MRF 0 is reserved for the debugger, so start with message header
3155 * in MRF 1.
3156 */
3157 int base_mrf = 1;
3158 int mrf = base_mrf;
3159 /* In the process of generating our URB write message contents, we
3160 * may need to unspill a register or load from an array. Those
3161 * reads would use MRFs 14-15.
3162 */
3163 int max_usable_mrf = 13;
3164
3165 /* The following assertion verifies that max_usable_mrf causes an
3166 * even-numbered amount of URB write data, which will meet gen6's
3167 * requirements for length alignment.
3168 */
3169 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3170
3171 /* First mrf is the g0-based message header containing URB handles and
3172 * such.
3173 */
3174 emit_urb_write_header(mrf++);
3175
3176 if (brw->gen < 6) {
3177 emit_ndc_computation();
3178 }
3179
3180 /* Lower legacy ff and ClipVertex clipping to clip distances */
3181 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3182 current_annotation = "user clip distances";
3183
3184 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3185 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3186
3187 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3188 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3189 }
3190
3191 /* We may need to split this up into several URB writes, so do them in a
3192 * loop.
3193 */
3194 int slot = 0;
3195 bool complete = false;
3196 do {
3197 /* URB offset is in URB row increments, and each of our MRFs is half of
3198 * one of those, since we're doing interleaved writes.
3199 */
3200 int offset = slot / 2;
3201
3202 mrf = base_mrf + 1;
3203 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3204 emit_urb_slot(dst_reg(MRF, mrf++),
3205 prog_data->vue_map.slot_to_varying[slot]);
3206
3207 /* If this was max_usable_mrf, we can't fit anything more into this
3208 * URB WRITE.
3209 */
3210 if (mrf > max_usable_mrf) {
3211 slot++;
3212 break;
3213 }
3214 }
3215
3216 complete = slot >= prog_data->vue_map.num_slots;
3217 current_annotation = "URB write";
3218 vec4_instruction *inst = emit_urb_write_opcode(complete);
3219 inst->base_mrf = base_mrf;
3220 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3221 inst->offset += offset;
3222 } while (!complete);
3223 }
3224
3225
3226 src_reg
3227 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3228 src_reg *reladdr, int reg_offset)
3229 {
3230 /* Because we store the values to scratch interleaved like our
3231 * vertex data, we need to scale the vec4 index by 2.
3232 */
3233 int message_header_scale = 2;
3234
3235 /* Pre-gen6, the message header uses byte offsets instead of vec4
3236 * (16-byte) offset units.
3237 */
3238 if (brw->gen < 6)
3239 message_header_scale *= 16;
3240
3241 if (reladdr) {
3242 src_reg index = src_reg(this, glsl_type::int_type);
3243
3244 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3245 src_reg(reg_offset)));
3246 emit_before(block, inst, MUL(dst_reg(index), index,
3247 src_reg(message_header_scale)));
3248
3249 return index;
3250 } else {
3251 return src_reg(reg_offset * message_header_scale);
3252 }
3253 }
3254
3255 src_reg
3256 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3257 src_reg *reladdr, int reg_offset)
3258 {
3259 if (reladdr) {
3260 src_reg index = src_reg(this, glsl_type::int_type);
3261
3262 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3263 src_reg(reg_offset)));
3264
3265 /* Pre-gen6, the message header uses byte offsets instead of vec4
3266 * (16-byte) offset units.
3267 */
3268 if (brw->gen < 6) {
3269 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3270 }
3271
3272 return index;
3273 } else if (brw->gen >= 8) {
3274 /* Store the offset in a GRF so we can send-from-GRF. */
3275 src_reg offset = src_reg(this, glsl_type::int_type);
3276 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3277 return offset;
3278 } else {
3279 int message_header_scale = brw->gen < 6 ? 16 : 1;
3280 return src_reg(reg_offset * message_header_scale);
3281 }
3282 }
3283
3284 /**
3285 * Emits an instruction before @inst to load the value named by @orig_src
3286 * from scratch space at @base_offset to @temp.
3287 *
3288 * @base_offset is measured in 32-byte units (the size of a register).
3289 */
3290 void
3291 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3292 dst_reg temp, src_reg orig_src,
3293 int base_offset)
3294 {
3295 int reg_offset = base_offset + orig_src.reg_offset;
3296 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3297 reg_offset);
3298
3299 emit_before(block, inst, SCRATCH_READ(temp, index));
3300 }
3301
3302 /**
3303 * Emits an instruction after @inst to store the value to be written
3304 * to @orig_dst to scratch space at @base_offset, from @temp.
3305 *
3306 * @base_offset is measured in 32-byte units (the size of a register).
3307 */
3308 void
3309 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3310 int base_offset)
3311 {
3312 int reg_offset = base_offset + inst->dst.reg_offset;
3313 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3314 reg_offset);
3315
3316 /* Create a temporary register to store *inst's result in.
3317 *
3318 * We have to be careful in MOVing from our temporary result register in
3319 * the scratch write. If we swizzle from channels of the temporary that
3320 * weren't initialized, it will confuse live interval analysis, which will
3321 * make spilling fail to make progress.
3322 */
3323 src_reg temp = src_reg(this, glsl_type::vec4_type);
3324 temp.type = inst->dst.type;
3325 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3326 int swizzles[4];
3327 for (int i = 0; i < 4; i++)
3328 if (inst->dst.writemask & (1 << i))
3329 swizzles[i] = i;
3330 else
3331 swizzles[i] = first_writemask_chan;
3332 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3333 swizzles[2], swizzles[3]);
3334
3335 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3336 inst->dst.writemask));
3337 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3338 write->predicate = inst->predicate;
3339 write->ir = inst->ir;
3340 write->annotation = inst->annotation;
3341 inst->insert_after(block, write);
3342
3343 inst->dst.file = temp.file;
3344 inst->dst.reg = temp.reg;
3345 inst->dst.reg_offset = temp.reg_offset;
3346 inst->dst.reladdr = NULL;
3347 }
3348
3349 /**
3350 * We can't generally support array access in GRF space, because a
3351 * single instruction's destination can only span 2 contiguous
3352 * registers. So, we send all GRF arrays that get variable index
3353 * access to scratch space.
3354 */
3355 void
3356 vec4_visitor::move_grf_array_access_to_scratch()
3357 {
3358 int scratch_loc[this->alloc.count];
3359 memset(scratch_loc, -1, sizeof(scratch_loc));
3360
3361 /* First, calculate the set of virtual GRFs that need to be punted
3362 * to scratch due to having any array access on them, and where in
3363 * scratch.
3364 */
3365 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3366 if (inst->dst.file == GRF && inst->dst.reladdr &&
3367 scratch_loc[inst->dst.reg] == -1) {
3368 scratch_loc[inst->dst.reg] = c->last_scratch;
3369 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3370 }
3371
3372 for (int i = 0 ; i < 3; i++) {
3373 src_reg *src = &inst->src[i];
3374
3375 if (src->file == GRF && src->reladdr &&
3376 scratch_loc[src->reg] == -1) {
3377 scratch_loc[src->reg] = c->last_scratch;
3378 c->last_scratch += this->alloc.sizes[src->reg];
3379 }
3380 }
3381 }
3382
3383 /* Now, for anything that will be accessed through scratch, rewrite
3384 * it to load/store. Note that this is a _safe list walk, because
3385 * we may generate a new scratch_write instruction after the one
3386 * we're processing.
3387 */
3388 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3389 /* Set up the annotation tracking for new generated instructions. */
3390 base_ir = inst->ir;
3391 current_annotation = inst->annotation;
3392
3393 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3394 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3395 }
3396
3397 for (int i = 0 ; i < 3; i++) {
3398 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3399 continue;
3400
3401 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3402
3403 emit_scratch_read(block, inst, temp, inst->src[i],
3404 scratch_loc[inst->src[i].reg]);
3405
3406 inst->src[i].file = temp.file;
3407 inst->src[i].reg = temp.reg;
3408 inst->src[i].reg_offset = temp.reg_offset;
3409 inst->src[i].reladdr = NULL;
3410 }
3411 }
3412 }
3413
3414 /**
3415 * Emits an instruction before @inst to load the value named by @orig_src
3416 * from the pull constant buffer (surface) at @base_offset to @temp.
3417 */
3418 void
3419 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3420 dst_reg temp, src_reg orig_src,
3421 int base_offset)
3422 {
3423 int reg_offset = base_offset + orig_src.reg_offset;
3424 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3425 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3426 reg_offset);
3427 vec4_instruction *load;
3428
3429 if (brw->gen >= 7) {
3430 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3431 grf_offset.type = offset.type;
3432 emit_before(block, inst, MOV(grf_offset, offset));
3433
3434 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3435 temp, index, src_reg(grf_offset));
3436 } else {
3437 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
3438 temp, index, offset);
3439 load->base_mrf = 14;
3440 load->mlen = 1;
3441 }
3442 emit_before(block, inst, load);
3443 }
3444
3445 /**
3446 * Implements array access of uniforms by inserting a
3447 * PULL_CONSTANT_LOAD instruction.
3448 *
3449 * Unlike temporary GRF array access (where we don't support it due to
3450 * the difficulty of doing relative addressing on instruction
3451 * destinations), we could potentially do array access of uniforms
3452 * that were loaded in GRF space as push constants. In real-world
3453 * usage we've seen, though, the arrays being used are always larger
3454 * than we could load as push constants, so just always move all
3455 * uniform array access out to a pull constant buffer.
3456 */
3457 void
3458 vec4_visitor::move_uniform_array_access_to_pull_constants()
3459 {
3460 int pull_constant_loc[this->uniforms];
3461 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3462 bool nested_reladdr;
3463
3464 /* Walk through and find array access of uniforms. Put a copy of that
3465 * uniform in the pull constant buffer.
3466 *
3467 * Note that we don't move constant-indexed accesses to arrays. No
3468 * testing has been done of the performance impact of this choice.
3469 */
3470 do {
3471 nested_reladdr = false;
3472
3473 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3474 for (int i = 0 ; i < 3; i++) {
3475 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3476 continue;
3477
3478 int uniform = inst->src[i].reg;
3479
3480 if (inst->src[i].reladdr->reladdr)
3481 nested_reladdr = true; /* will need another pass */
3482
3483 /* If this array isn't already present in the pull constant buffer,
3484 * add it.
3485 */
3486 if (pull_constant_loc[uniform] == -1) {
3487 const gl_constant_value **values =
3488 &stage_prog_data->param[uniform * 4];
3489
3490 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
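/* pull_param is tracked per scalar component (four per uniform vec4), so
 * nr_pull_params / 4 is the vec4 slot this uniform will occupy.
 */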
3491
3492 assert(uniform < uniform_array_size);
3493 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3494 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3495 = values[j];
3496 }
3497 }
3498
3499 /* Set up the annotation tracking for new generated instructions. */
3500 base_ir = inst->ir;
3501 current_annotation = inst->annotation;
3502
3503 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3504
3505 emit_pull_constant_load(block, inst, temp, inst->src[i],
3506 pull_constant_loc[uniform]);
3507
3508 inst->src[i].file = temp.file;
3509 inst->src[i].reg = temp.reg;
3510 inst->src[i].reg_offset = temp.reg_offset;
3511 inst->src[i].reladdr = NULL;
3512 }
3513 }
3514 } while (nested_reladdr);
3515
3516 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3517 * no need to track them as larger-than-vec4 objects. This will be
3518 * relied on in cutting out unused uniform vectors from push
3519 * constants.
3520 */
3521 split_uniform_registers();
3522 }
3523
3524 void
3525 vec4_visitor::resolve_ud_negate(src_reg *reg)
3526 {
3527 if (reg->type != BRW_REGISTER_TYPE_UD ||
3528 !reg->negate)
3529 return;
3530
3531 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3532 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3533 *reg = temp;
3534 }
3535
3536 /**
3537 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3538 *
3539 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3540 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3541 */
3542 void
3543 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3544 {
3545 assert(brw->gen <= 5);
3546
3547 if (!rvalue->type->is_boolean())
3548 return;
3549
3550 src_reg and_result = src_reg(this, rvalue->type);
3551 src_reg neg_result = src_reg(this, rvalue->type);
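/* Isolate the defined LSB, then negate it: -(x & 1) is 0 for false and
 * ~0 for true, the proper boolean values described above.
 */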
3552 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3553 emit(MOV(dst_reg(neg_result), negate(and_result)));
3554 *reg = neg_result;
3555 }
3556
3557 vec4_visitor::vec4_visitor(struct brw_context *brw,
3558 struct brw_vec4_compile *c,
3559 struct gl_program *prog,
3560 const struct brw_vue_prog_key *key,
3561 struct brw_vue_prog_data *prog_data,
3562 struct gl_shader_program *shader_prog,
3563 gl_shader_stage stage,
3564 void *mem_ctx,
3565 bool debug_flag,
3566 bool no_spills,
3567 shader_time_shader_type st_base,
3568 shader_time_shader_type st_written,
3569 shader_time_shader_type st_reset)
3570 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3571 c(c),
3572 key(key),
3573 prog_data(prog_data),
3574 sanity_param_count(0),
3575 fail_msg(NULL),
3576 first_non_payload_grf(0),
3577 need_all_constants_in_pull_buffer(false),
3578 debug_flag(debug_flag),
3579 no_spills(no_spills),
3580 st_base(st_base),
3581 st_written(st_written),
3582 st_reset(st_reset)
3583 {
3584 this->mem_ctx = mem_ctx;
3585 this->failed = false;
3586
3587 this->base_ir = NULL;
3588 this->current_annotation = NULL;
3589 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3590
3591 this->variable_ht = hash_table_ctor(0,
3592 hash_table_pointer_hash,
3593 hash_table_pointer_compare);
3594
3595 this->virtual_grf_start = NULL;
3596 this->virtual_grf_end = NULL;
3597 this->live_intervals = NULL;
3598
3599 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3600
3601 this->uniforms = 0;
3602
3603 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3604 * at least one. See setup_uniforms() in brw_vec4.cpp.
3605 */
3606 this->uniform_array_size = 1;
3607 if (prog_data) {
3608 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3609 }
3610
3611 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3612 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3613 }
3614
3615 vec4_visitor::~vec4_visitor()
3616 {
3617 hash_table_dtor(this->variable_ht);
3618 }
3619
3620
3621 void
3622 vec4_visitor::fail(const char *format, ...)
3623 {
3624 va_list va;
3625 char *msg;
3626
3627 if (failed)
3628 return;
3629
3630 failed = true;
3631
3632 va_start(va, format);
3633 msg = ralloc_vasprintf(mem_ctx, format, va);
3634 va_end(va);
3635 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3636
3637 this->fail_msg = msg;
3638
3639 if (debug_flag) {
3640 fprintf(stderr, "%s", msg);
3641 }
3642 }
3643
3644 } /* namespace brw */