src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
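/**
 * @file brw_vec4_visitor.cpp
 *
 * Walks the GLSL IR of a shader and emits the vec4 backend IR
 * (vec4_instruction) consumed by the i965 vertex/geometry shader compiler.
 */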
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->target = 0;
47 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
48 this->shadow_compare = false;
49 this->ir = NULL;
50 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
51 this->header_present = false;
52 this->mlen = 0;
53 this->base_mrf = 0;
54 this->offset = 0;
55 this->annotation = NULL;
56 }
57
58 vec4_instruction *
59 vec4_visitor::emit(vec4_instruction *inst)
60 {
61 inst->ir = this->base_ir;
62 inst->annotation = this->current_annotation;
63
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
69 vec4_instruction *
70 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
71 vec4_instruction *new_inst)
72 {
73 new_inst->ir = inst->ir;
74 new_inst->annotation = inst->annotation;
75
76 inst->insert_before(block, new_inst);
77
78 return inst;
79 }
80
81 vec4_instruction *
82 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
83 const src_reg &src1, const src_reg &src2)
84 {
85 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
91 const src_reg &src1)
92 {
93 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
98 {
99 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
104 {
105 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode)
110 {
111 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
112 }
113
114 #define ALU1(op) \
115 vec4_instruction * \
116 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
117 { \
118 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
119 }
120
121 #define ALU2(op) \
122 vec4_instruction * \
123 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
124 const src_reg &src1) \
125 { \
126 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
127 src0, src1); \
128 }
129
130 #define ALU2_ACC(op) \
131 vec4_instruction * \
132 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
133 const src_reg &src1) \
134 { \
135 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
136 BRW_OPCODE_##op, dst, src0, src1); \
137 inst->writes_accumulator = true; \
138 return inst; \
139 }
140
141 #define ALU3(op) \
142 vec4_instruction * \
143 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
144 const src_reg &src1, const src_reg &src2) \
145 { \
146 assert(brw->gen >= 6); \
147 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
148 src0, src1, src2); \
149 }
150
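/* Instantiate convenience constructors (NOT, MOV, ADD, ...) that build a
 * vec4_instruction without emitting it; callers typically wrap them in
 * emit(), e.g. emit(MOV(dst, src)).
 */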
151 ALU1(NOT)
152 ALU1(MOV)
153 ALU1(FRC)
154 ALU1(RNDD)
155 ALU1(RNDE)
156 ALU1(RNDZ)
157 ALU1(F32TO16)
158 ALU1(F16TO32)
159 ALU2(ADD)
160 ALU2(MUL)
161 ALU2_ACC(MACH)
162 ALU2(AND)
163 ALU2(OR)
164 ALU2(XOR)
165 ALU2(DP3)
166 ALU2(DP4)
167 ALU2(DPH)
168 ALU2(SHL)
169 ALU2(SHR)
170 ALU2(ASR)
171 ALU3(LRP)
172 ALU1(BFREV)
173 ALU3(BFE)
174 ALU2(BFI1)
175 ALU3(BFI2)
176 ALU1(FBH)
177 ALU1(FBL)
178 ALU1(CBIT)
179 ALU3(MAD)
180 ALU2_ACC(ADDC)
181 ALU2_ACC(SUBB)
182 ALU2(MAC)
183
184 /** Gen4 predicated IF. */
185 vec4_instruction *
186 vec4_visitor::IF(enum brw_predicate predicate)
187 {
188 vec4_instruction *inst;
189
190 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
191 inst->predicate = predicate;
192
193 return inst;
194 }
195
196 /** Gen6 IF with embedded comparison. */
197 vec4_instruction *
198 vec4_visitor::IF(src_reg src0, src_reg src1,
199 enum brw_conditional_mod condition)
200 {
201 assert(brw->gen == 6);
202
203 vec4_instruction *inst;
204
205 resolve_ud_negate(&src0);
206 resolve_ud_negate(&src1);
207
208 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
209 src0, src1);
210 inst->conditional_mod = condition;
211
212 return inst;
213 }
214
215 /**
216 * CMP: Sets the low bit of the destination channels with the result
217 * of the comparison, while the upper bits are undefined, and updates
218 * the flag register with the packed 16 bits of the result.
219 */
220 vec4_instruction *
221 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
222 enum brw_conditional_mod condition)
223 {
224 vec4_instruction *inst;
225
226 /* Take the instruction:
227 *
228 * CMP null<d> src0<f> src1<f>
229 *
230 * Original gen4 does type conversion to the destination type before
231 * comparison, producing garbage results for floating point comparisons.
232 *
233 * The destination type doesn't matter on newer generations, so we set the
234 * type to match src0 so we can compact the instruction.
235 */
236 dst.type = src0.type;
237 if (dst.file == HW_REG)
238 dst.fixed_hw_reg.type = dst.type;
239
240 resolve_ud_negate(&src0);
241 resolve_ud_negate(&src1);
242
243 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
244 inst->conditional_mod = condition;
245
246 return inst;
247 }
248
249 vec4_instruction *
250 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
251 {
252 vec4_instruction *inst;
253
254 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
255 dst, index);
256 inst->base_mrf = 14;
257 inst->mlen = 2;
258
259 return inst;
260 }
261
262 vec4_instruction *
263 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
264 const src_reg &index)
265 {
266 vec4_instruction *inst;
267
268 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
269 dst, src, index);
270 inst->base_mrf = 13;
271 inst->mlen = 3;
272
273 return inst;
274 }
275
276 void
277 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
278 {
279 static enum opcode dot_opcodes[] = {
280 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
281 };
282
283 emit(dot_opcodes[elements - 2], dst, src0, src1);
284 }
285
286 src_reg
287 vec4_visitor::fix_3src_operand(src_reg src)
288 {
289 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
290 * able to use vertical stride of zero to replicate the vec4 uniform, like
291 *
292 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
293 *
294 * But you can't, since vertical stride is always four in three-source
295 * instructions. Instead, insert a MOV instruction to do the replication so
296 * that the three-source instruction can consume it.
297 */
298
299 /* The MOV is only needed if the source is a uniform or immediate. */
300 if (src.file != UNIFORM && src.file != IMM)
301 return src;
302
303 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
304 return src;
305
306 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
307 expanded.type = src.type;
308 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
309 return src_reg(expanded);
310 }
311
312 src_reg
313 vec4_visitor::fix_math_operand(src_reg src)
314 {
315 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
316 return src;
317
318 /* The gen6 math instruction ignores the source modifiers --
319 * swizzle, abs, negate, and at least some parts of the register
320 * region description.
321 *
322 * Rather than trying to enumerate all these cases, *always* expand the
323 * operand to a temp GRF for gen6.
324 *
325 * For gen7, keep the operand as-is, except if immediate, which gen7 still
326 * can't use.
327 */
328
329 if (brw->gen == 7 && src.file != IMM)
330 return src;
331
332 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
333 expanded.type = src.type;
334 emit(MOV(expanded, src));
335 return src_reg(expanded);
336 }
337
338 void
339 vec4_visitor::emit_math(enum opcode opcode,
340 const dst_reg &dst,
341 const src_reg &src0, const src_reg &src1)
342 {
343 vec4_instruction *math =
344 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
345
346 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
347 /* MATH on Gen6 must be align1, so we can't do writemasks. */
348 math->dst = dst_reg(this, glsl_type::vec4_type);
349 math->dst.type = dst.type;
350 emit(MOV(dst, src_reg(math->dst)));
351 } else if (brw->gen < 6) {
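      /* Pre-gen6 math is a send to the shared math unit, so the operands
       * live in MRFs starting at base_mrf; mlen is one register per operand.
       */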
352 math->base_mrf = 1;
353 math->mlen = src1.file == BAD_FILE ? 1 : 2;
354 }
355 }
356
357 void
358 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
359 {
360 if (brw->gen < 7) {
361 unreachable("ir_unop_pack_half_2x16 should be lowered");
362 }
363
364 assert(dst.type == BRW_REGISTER_TYPE_UD);
365 assert(src0.type == BRW_REGISTER_TYPE_F);
366
367 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
368 *
369 * Because this instruction does not have a 16-bit floating-point type,
370 * the destination data type must be Word (W).
371 *
372 * The destination must be DWord-aligned and specify a horizontal stride
373 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
374 * each destination channel and the upper word is not modified.
375 *
376 * The above restriction implies that the f32to16 instruction must use
377 * align1 mode, because only in align1 mode is it possible to specify
378 * horizontal stride. We choose here to defy the hardware docs and emit
379 * align16 instructions.
380 *
381 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
382 * instructions. I was partially successful in that the code passed all
383 * tests. However, the code was dubiously correct and fragile, and the
384 * tests were not harsh enough to probe that frailty. Not trusting the
385 * code, I chose instead to remain in align16 mode in defiance of the hw
386 * docs).
387 *
388 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
389 * simulator, emitting a f32to16 in align16 mode with UD as destination
390 * data type is safe. The behavior differs from that specified in the PRM
391 * in that the upper word of each destination channel is cleared to 0.
392 */
393
394 dst_reg tmp_dst(this, glsl_type::uvec2_type);
395 src_reg tmp_src(tmp_dst);
396
397 #if 0
398 /* Verify the undocumented behavior on which the following instructions
399 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
400 * then the result of the bit-or instruction below will be incorrect.
401 *
402 * You should inspect the disasm output in order to verify that the MOV is
403 * not optimized away.
404 */
405 emit(MOV(tmp_dst, src_reg(0x12345678u)));
406 #endif
407
408 /* Give tmp the form below, where "." means untouched.
409 *
410 * w z y x w z y x
411 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
412 *
413 * That the upper word of each write-channel be 0 is required for the
414 * following bit-shift and bit-or instructions to work. Note that this
415 * relies on the undocumented hardware behavior mentioned above.
416 */
417 tmp_dst.writemask = WRITEMASK_XY;
418 emit(F32TO16(tmp_dst, src0));
419
420 /* Give the write-channels of dst the form:
421 * 0xhhhh0000
422 */
423 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
424 emit(SHL(dst, tmp_src, src_reg(16u)));
425
426 /* Finally, give the write-channels of dst the form of packHalf2x16's
427 * output:
428 * 0xhhhhllll
429 */
430 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
431 emit(OR(dst, src_reg(dst), tmp_src));
432 }
433
434 void
435 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
436 {
437 if (brw->gen < 7) {
438 unreachable("ir_unop_unpack_half_2x16 should be lowered");
439 }
440
441 assert(dst.type == BRW_REGISTER_TYPE_F);
442 assert(src0.type == BRW_REGISTER_TYPE_UD);
443
444 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
445 *
446 * Because this instruction does not have a 16-bit floating-point type,
447 * the source data type must be Word (W). The destination type must be
448 * F (Float).
449 *
450 * To use W as the source data type, we must adjust horizontal strides,
451 * which is only possible in align1 mode. All my [chadv] attempts at
452 * emitting align1 instructions for unpackHalf2x16 failed to pass the
453 * Piglit tests, so I gave up.
454 *
455 * I've verified that, on gen7 hardware and the simulator, it is safe to
456 * emit f16to32 in align16 mode with UD as source data type.
457 */
458
459 dst_reg tmp_dst(this, glsl_type::uvec2_type);
460 src_reg tmp_src(tmp_dst);
461
462 tmp_dst.writemask = WRITEMASK_X;
463 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
464
465 tmp_dst.writemask = WRITEMASK_Y;
466 emit(SHR(tmp_dst, src0, src_reg(16u)));
467
468 dst.writemask = WRITEMASK_XY;
469 emit(F16TO32(dst, tmp_src));
470 }
471
472 void
473 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
474 {
475 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
476 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
477 * is not suitable to generate the shift values, but we can use the packed
478 * vector float and a type-converting MOV.
479 */
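   /* In the 8-bit vector-float (VF) immediate encoding, the bytes
    * 0x00, 0x60, 0x70 and 0x78 are 0.0, 8.0, 16.0 and 24.0, so the
    * type-converting MOV below yields the shift counts <0, 8, 16, 24>.
    */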
480 dst_reg shift(this, glsl_type::uvec4_type);
481 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
482
483 dst_reg shifted(this, glsl_type::uvec4_type);
484 src0.swizzle = BRW_SWIZZLE_XXXX;
485 emit(SHR(shifted, src0, src_reg(shift)));
486
487 shifted.type = BRW_REGISTER_TYPE_UB;
488 dst_reg f(this, glsl_type::vec4_type);
489 emit(MOV(f, src_reg(shifted)));
490
491 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
492 }
493
494 void
495 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
496 {
497 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
498 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
499 * is not suitable to generate the shift values, but we can use the packed
500 * vector float and a type-converting MOV.
501 */
502 dst_reg shift(this, glsl_type::uvec4_type);
503 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
504
505 dst_reg shifted(this, glsl_type::uvec4_type);
506 src0.swizzle = BRW_SWIZZLE_XXXX;
507 emit(SHR(shifted, src0, src_reg(shift)));
508
509 shifted.type = BRW_REGISTER_TYPE_B;
510 dst_reg f(this, glsl_type::vec4_type);
511 emit(MOV(f, src_reg(shifted)));
512
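   /* Scale the sign-extended bytes by 1/127 and clamp to [-1, 1], matching
    * the GLSL unpackSnorm4x8() rules.
    */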
513 dst_reg scaled(this, glsl_type::vec4_type);
514 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
515
516 dst_reg max(this, glsl_type::vec4_type);
517 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
518 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
519 }
520
521 void
522 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
523 {
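   /* packUnorm4x8(): clamp each component to [0, 1], scale by 255, round to
    * nearest even, convert to unsigned, and pack the four bytes into a dword.
    */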
524 dst_reg saturated(this, glsl_type::vec4_type);
525 vec4_instruction *inst = emit(MOV(saturated, src0));
526 inst->saturate = true;
527
528 dst_reg scaled(this, glsl_type::vec4_type);
529 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
530
531 dst_reg rounded(this, glsl_type::vec4_type);
532 emit(RNDE(rounded, src_reg(scaled)));
533
534 dst_reg u(this, glsl_type::uvec4_type);
535 emit(MOV(u, src_reg(rounded)));
536
537 src_reg bytes(u);
538 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
539 }
540
541 void
542 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
543 {
544 dst_reg max(this, glsl_type::vec4_type);
545 emit_minmax(BRW_CONDITIONAL_G, max, src0, src_reg(-1.0f));
546
547 dst_reg min(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
549
550 dst_reg scaled(this, glsl_type::vec4_type);
551 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
552
553 dst_reg rounded(this, glsl_type::vec4_type);
554 emit(RNDE(rounded, src_reg(scaled)));
555
556 dst_reg i(this, glsl_type::ivec4_type);
557 emit(MOV(i, src_reg(rounded)));
558
559 src_reg bytes(i);
560 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
561 }
562
563 void
564 vec4_visitor::visit_instructions(const exec_list *list)
565 {
566 foreach_in_list(ir_instruction, ir, list) {
567 base_ir = ir;
568 ir->accept(this);
569 }
570 }
571
572
573 static int
574 type_size(const struct glsl_type *type)
575 {
576 unsigned int i;
577 int size;
578
579 switch (type->base_type) {
580 case GLSL_TYPE_UINT:
581 case GLSL_TYPE_INT:
582 case GLSL_TYPE_FLOAT:
583 case GLSL_TYPE_BOOL:
584 if (type->is_matrix()) {
585 return type->matrix_columns;
586 } else {
587 /* Regardless of size of vector, it gets a vec4. This is bad
588 * packing for things like floats, but otherwise arrays become a
589 * mess. Hopefully a later pass over the code can pack scalars
590 * down if appropriate.
591 */
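         /* For example, a float, a vec2 and a vec4 each occupy one vec4
          * slot here, while a mat3 above returns three (one per column).
          */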
592 return 1;
593 }
594 case GLSL_TYPE_ARRAY:
595 assert(type->length > 0);
596 return type_size(type->fields.array) * type->length;
597 case GLSL_TYPE_STRUCT:
598 size = 0;
599 for (i = 0; i < type->length; i++) {
600 size += type_size(type->fields.structure[i].type);
601 }
602 return size;
603 case GLSL_TYPE_SAMPLER:
604 /* Samplers take up no register space, since they're baked in at
605 * link time.
606 */
607 return 0;
608 case GLSL_TYPE_ATOMIC_UINT:
609 return 0;
610 case GLSL_TYPE_IMAGE:
611 case GLSL_TYPE_VOID:
612 case GLSL_TYPE_ERROR:
613 case GLSL_TYPE_INTERFACE:
614 unreachable("not reached");
615 }
616
617 return 0;
618 }
619
620 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
621 {
622 init();
623
624 this->file = GRF;
625 this->reg = v->alloc.allocate(type_size(type));
626
627 if (type->is_array() || type->is_record()) {
628 this->swizzle = BRW_SWIZZLE_NOOP;
629 } else {
630 this->swizzle = swizzle_for_size(type->vector_elements);
631 }
632
633 this->type = brw_type_for_base_type(type);
634 }
635
636 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
637 {
638 assert(size > 0);
639
640 init();
641
642 this->file = GRF;
643 this->reg = v->alloc.allocate(type_size(type) * size);
644
645 this->swizzle = BRW_SWIZZLE_NOOP;
646
647 this->type = brw_type_for_base_type(type);
648 }
649
650 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
651 {
652 init();
653
654 this->file = GRF;
655 this->reg = v->alloc.allocate(type_size(type));
656
657 if (type->is_array() || type->is_record()) {
658 this->writemask = WRITEMASK_XYZW;
659 } else {
660 this->writemask = (1 << type->vector_elements) - 1;
661 }
662
663 this->type = brw_type_for_base_type(type);
664 }
665
666 /* Our support for uniforms is piggy-backed on the struct
667 * gl_fragment_program, because that's where the values actually
668 * get stored, rather than in some global gl_shader_program uniform
669 * store.
670 */
671 void
672 vec4_visitor::setup_uniform_values(ir_variable *ir)
673 {
674 int namelen = strlen(ir->name);
675
676 /* The data for our (non-builtin) uniforms is stored in a series of
677 * gl_uniform_driver_storage structs for each subcomponent that
678 * glGetUniformLocation() could name. We know it's been set up in the same
679 * order we'd walk the type, so walk the list of storage and find anything
680 * with our name, or the prefix of a component that starts with our name.
681 */
682 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
683 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
684
685 if (strncmp(ir->name, storage->name, namelen) != 0 ||
686 (storage->name[namelen] != 0 &&
687 storage->name[namelen] != '.' &&
688 storage->name[namelen] != '[')) {
689 continue;
690 }
691
692 gl_constant_value *components = storage->storage;
693 unsigned vector_count = (MAX2(storage->array_elements, 1) *
694 storage->type->matrix_columns);
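      /* One vec4 slot per matrix column of each array element; e.g. a
       * mat4[2] uniform contributes eight vectors.
       */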
695
696 for (unsigned s = 0; s < vector_count; s++) {
697 assert(uniforms < uniform_array_size);
698 uniform_vector_size[uniforms] = storage->type->vector_elements;
699
700 int i;
701 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
702 stage_prog_data->param[uniforms * 4 + i] = components;
703 components++;
704 }
705 for (; i < 4; i++) {
706 static gl_constant_value zero = { 0.0 };
707 stage_prog_data->param[uniforms * 4 + i] = &zero;
708 }
709
710 uniforms++;
711 }
712 }
713 }
714
715 void
716 vec4_visitor::setup_uniform_clipplane_values()
717 {
718 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
719
720 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
721 assert(this->uniforms < uniform_array_size);
722 this->uniform_vector_size[this->uniforms] = 4;
723 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
724 this->userplane[i].type = BRW_REGISTER_TYPE_F;
725 for (int j = 0; j < 4; ++j) {
726 stage_prog_data->param[this->uniforms * 4 + j] =
727 (gl_constant_value *) &clip_planes[i][j];
728 }
729 ++this->uniforms;
730 }
731 }
732
733 /* Our support for builtin uniforms is even scarier than non-builtin.
734 * It sits on top of the PROG_STATE_VAR parameters that are
735 * automatically updated from GL context state.
736 */
737 void
738 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
739 {
740 const ir_state_slot *const slots = ir->get_state_slots();
741 assert(slots != NULL);
742
743 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
744 /* This state reference has already been setup by ir_to_mesa,
745 * but we'll get the same index back here. We can reference
746 * ParameterValues directly, since unlike brw_fs.cpp, we never
747 * add new state references during compile.
748 */
749 int index = _mesa_add_state_reference(this->prog->Parameters,
750 (gl_state_index *)slots[i].tokens);
751 gl_constant_value *values =
752 &this->prog->Parameters->ParameterValues[index][0];
753
754 assert(this->uniforms < uniform_array_size);
755 this->uniform_vector_size[this->uniforms] = 0;
756 /* Add each of the unique swizzled channels of the element.
757 * This will end up matching the size of the glsl_type of this field.
758 */
759 int last_swiz = -1;
760 for (unsigned int j = 0; j < 4; j++) {
761 int swiz = GET_SWZ(slots[i].swizzle, j);
762 last_swiz = swiz;
763
764 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
765 assert(this->uniforms < uniform_array_size);
766 if (swiz <= last_swiz)
767 this->uniform_vector_size[this->uniforms]++;
768 }
769 this->uniforms++;
770 }
771 }
772
773 dst_reg *
774 vec4_visitor::variable_storage(ir_variable *var)
775 {
776 return (dst_reg *)hash_table_find(this->variable_ht, var);
777 }
778
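/**
 * Evaluate a boolean rvalue and leave its result in the flag register,
 * folding comparisons directly into the flag-writing instruction where
 * possible. *predicate is set to the predicate the caller should use
 * (BRW_PREDICATE_NORMAL, or an ALL4H/ANY4H variant for vector comparisons).
 */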
779 void
780 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
781 enum brw_predicate *predicate)
782 {
783 ir_expression *expr = ir->as_expression();
784
785 *predicate = BRW_PREDICATE_NORMAL;
786
787 if (expr && expr->operation != ir_binop_ubo_load) {
788 src_reg op[3];
789 vec4_instruction *inst;
790
791 assert(expr->get_num_operands() <= 3);
792 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
793 expr->operands[i]->accept(this);
794 op[i] = this->result;
795
796 resolve_ud_negate(&op[i]);
797 }
798
799 switch (expr->operation) {
800 case ir_unop_logic_not:
801 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
802 inst->conditional_mod = BRW_CONDITIONAL_Z;
803 break;
804
805 case ir_binop_logic_xor:
806 if (brw->gen <= 5) {
807 src_reg temp = src_reg(this, ir->type);
808 emit(XOR(dst_reg(temp), op[0], op[1]));
809 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
810 } else {
811 inst = emit(XOR(dst_null_d(), op[0], op[1]));
812 }
813 inst->conditional_mod = BRW_CONDITIONAL_NZ;
814 break;
815
816 case ir_binop_logic_or:
817 if (brw->gen <= 5) {
818 src_reg temp = src_reg(this, ir->type);
819 emit(OR(dst_reg(temp), op[0], op[1]));
820 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
821 } else {
822 inst = emit(OR(dst_null_d(), op[0], op[1]));
823 }
824 inst->conditional_mod = BRW_CONDITIONAL_NZ;
825 break;
826
827 case ir_binop_logic_and:
828 if (brw->gen <= 5) {
829 src_reg temp = src_reg(this, ir->type);
830 emit(AND(dst_reg(temp), op[0], op[1]));
831 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
832 } else {
833 inst = emit(AND(dst_null_d(), op[0], op[1]));
834 }
835 inst->conditional_mod = BRW_CONDITIONAL_NZ;
836 break;
837
838 case ir_unop_f2b:
839 if (brw->gen >= 6) {
840 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
841 } else {
842 inst = emit(MOV(dst_null_f(), op[0]));
843 inst->conditional_mod = BRW_CONDITIONAL_NZ;
844 }
845 break;
846
847 case ir_unop_i2b:
848 if (brw->gen >= 6) {
849 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
850 } else {
851 inst = emit(MOV(dst_null_d(), op[0]));
852 inst->conditional_mod = BRW_CONDITIONAL_NZ;
853 }
854 break;
855
856 case ir_binop_all_equal:
857 if (brw->gen <= 5) {
858 resolve_bool_comparison(expr->operands[0], &op[0]);
859 resolve_bool_comparison(expr->operands[1], &op[1]);
860 }
861 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
862 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
863 break;
864
865 case ir_binop_any_nequal:
866 if (brw->gen <= 5) {
867 resolve_bool_comparison(expr->operands[0], &op[0]);
868 resolve_bool_comparison(expr->operands[1], &op[1]);
869 }
870 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
871 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
872 break;
873
874 case ir_unop_any:
875 if (brw->gen <= 5) {
876 resolve_bool_comparison(expr->operands[0], &op[0]);
877 }
878 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
879 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
880 break;
881
882 case ir_binop_greater:
883 case ir_binop_gequal:
884 case ir_binop_less:
885 case ir_binop_lequal:
886 case ir_binop_equal:
887 case ir_binop_nequal:
888 if (brw->gen <= 5) {
889 resolve_bool_comparison(expr->operands[0], &op[0]);
890 resolve_bool_comparison(expr->operands[1], &op[1]);
891 }
892 emit(CMP(dst_null_d(), op[0], op[1],
893 brw_conditional_for_comparison(expr->operation)));
894 break;
895
896 case ir_triop_csel: {
897 /* Expand the boolean condition into the flag register. */
898 inst = emit(MOV(dst_null_d(), op[0]));
899 inst->conditional_mod = BRW_CONDITIONAL_NZ;
900
901 /* Select which boolean to return. */
902 dst_reg temp(this, expr->operands[1]->type);
903 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
904 inst->predicate = BRW_PREDICATE_NORMAL;
905
906 /* Expand the result to a condition code. */
907 inst = emit(MOV(dst_null_d(), src_reg(temp)));
908 inst->conditional_mod = BRW_CONDITIONAL_NZ;
909 break;
910 }
911
912 default:
913 unreachable("not reached");
914 }
915 return;
916 }
917
918 ir->accept(this);
919
920 resolve_ud_negate(&this->result);
921
922 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
923 inst->conditional_mod = BRW_CONDITIONAL_NZ;
924 }
925
926 /**
927 * Emit a gen6 IF statement with the comparison folded into the IF
928 * instruction.
929 */
930 void
931 vec4_visitor::emit_if_gen6(ir_if *ir)
932 {
933 ir_expression *expr = ir->condition->as_expression();
934
935 if (expr && expr->operation != ir_binop_ubo_load) {
936 src_reg op[3];
937 dst_reg temp;
938
939 assert(expr->get_num_operands() <= 3);
940 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
941 expr->operands[i]->accept(this);
942 op[i] = this->result;
943 }
944
945 switch (expr->operation) {
946 case ir_unop_logic_not:
947 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
948 return;
949
950 case ir_binop_logic_xor:
951 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
952 return;
953
954 case ir_binop_logic_or:
955 temp = dst_reg(this, glsl_type::bool_type);
956 emit(OR(temp, op[0], op[1]));
957 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
958 return;
959
960 case ir_binop_logic_and:
961 temp = dst_reg(this, glsl_type::bool_type);
962 emit(AND(temp, op[0], op[1]));
963 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
964 return;
965
966 case ir_unop_f2b:
967 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
968 return;
969
970 case ir_unop_i2b:
971 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
972 return;
973
974 case ir_binop_greater:
975 case ir_binop_gequal:
976 case ir_binop_less:
977 case ir_binop_lequal:
978 case ir_binop_equal:
979 case ir_binop_nequal:
980 emit(IF(op[0], op[1],
981 brw_conditional_for_comparison(expr->operation)));
982 return;
983
984 case ir_binop_all_equal:
985 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
986 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
987 return;
988
989 case ir_binop_any_nequal:
990 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
991 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
992 return;
993
994 case ir_unop_any:
995 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
996 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
997 return;
998
999 case ir_triop_csel: {
1000 /* Expand the boolean condition into the flag register. */
1001 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1002 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1003
1004 /* Select which boolean to return. */
1005 dst_reg temp(this, expr->operands[1]->type);
1006 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1007 inst->predicate = BRW_PREDICATE_NORMAL;
1008
1009 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1010 return;
1011 }
1012
1013 default:
1014 unreachable("not reached");
1015 }
1016 return;
1017 }
1018
1019 ir->condition->accept(this);
1020
1021 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1022 }
1023
1024 void
1025 vec4_visitor::visit(ir_variable *ir)
1026 {
1027 dst_reg *reg = NULL;
1028
1029 if (variable_storage(ir))
1030 return;
1031
1032 switch (ir->data.mode) {
1033 case ir_var_shader_in:
1034 assert(ir->data.location != -1);
1035 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1036 break;
1037
1038 case ir_var_shader_out:
1039 assert(ir->data.location != -1);
1040 reg = new(mem_ctx) dst_reg(this, ir->type);
1041
1042 for (int i = 0; i < type_size(ir->type); i++) {
1043 output_reg[ir->data.location + i] = *reg;
1044 output_reg[ir->data.location + i].reg_offset = i;
1045 output_reg[ir->data.location + i].type =
1046 brw_type_for_base_type(ir->type->get_scalar_type());
1047 output_reg_annotation[ir->data.location + i] = ir->name;
1048 }
1049 break;
1050
1051 case ir_var_auto:
1052 case ir_var_temporary:
1053 reg = new(mem_ctx) dst_reg(this, ir->type);
1054 break;
1055
1056 case ir_var_uniform:
1057 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1058
1059 /* Thanks to the lower_ubo_reference pass, we will see only
1060 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1061 * variables, so no need for them to be in variable_ht.
1062 *
1063 * Some uniforms, such as samplers and atomic counters, have no actual
1064 * storage, so we should ignore them.
1065 */
1066 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1067 return;
1068
1069 /* Track how big the whole uniform variable is, in case we need to put a
1070 * copy of its data into pull constants for array access.
1071 */
1072 assert(this->uniforms < uniform_array_size);
1073 this->uniform_size[this->uniforms] = type_size(ir->type);
1074
1075 if (!strncmp(ir->name, "gl_", 3)) {
1076 setup_builtin_uniform_values(ir);
1077 } else {
1078 setup_uniform_values(ir);
1079 }
1080 break;
1081
1082 case ir_var_system_value:
1083 reg = make_reg_for_system_value(ir);
1084 break;
1085
1086 default:
1087 unreachable("not reached");
1088 }
1089
1090 reg->type = brw_type_for_base_type(ir->type);
1091 hash_table_insert(this->variable_ht, reg, ir);
1092 }
1093
1094 void
1095 vec4_visitor::visit(ir_loop *ir)
1096 {
1097 /* We don't want debugging output to print the whole body of the
1098 * loop as the annotation.
1099 */
1100 this->base_ir = NULL;
1101
1102 emit(BRW_OPCODE_DO);
1103
1104 visit_instructions(&ir->body_instructions);
1105
1106 emit(BRW_OPCODE_WHILE);
1107 }
1108
1109 void
1110 vec4_visitor::visit(ir_loop_jump *ir)
1111 {
1112 switch (ir->mode) {
1113 case ir_loop_jump::jump_break:
1114 emit(BRW_OPCODE_BREAK);
1115 break;
1116 case ir_loop_jump::jump_continue:
1117 emit(BRW_OPCODE_CONTINUE);
1118 break;
1119 }
1120 }
1121
1122
1123 void
1124 vec4_visitor::visit(ir_function_signature *)
1125 {
1126 unreachable("not reached");
1127 }
1128
1129 void
1130 vec4_visitor::visit(ir_function *ir)
1131 {
1132 /* Ignore function bodies other than main() -- we shouldn't see calls to
1133 * them since they should all be inlined.
1134 */
1135 if (strcmp(ir->name, "main") == 0) {
1136 const ir_function_signature *sig;
1137 exec_list empty;
1138
1139 sig = ir->matching_signature(NULL, &empty, false);
1140
1141 assert(sig);
1142
1143 visit_instructions(&sig->body);
1144 }
1145 }
1146
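/**
 * Try to fold an ir_binop_add whose other operand is a multiply into a
 * single MAD (dst = src1 * src2 + src0). Only applies to float operands
 * on gen6+, where three-source instructions exist.
 */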
1147 bool
1148 vec4_visitor::try_emit_mad(ir_expression *ir)
1149 {
1150 /* 3-src instructions were introduced in gen6. */
1151 if (brw->gen < 6)
1152 return false;
1153
1154 /* MAD can only handle floating-point data. */
1155 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1156 return false;
1157
1158 ir_rvalue *nonmul = ir->operands[1];
1159 ir_expression *mul = ir->operands[0]->as_expression();
1160
1161 if (!mul || mul->operation != ir_binop_mul) {
1162 nonmul = ir->operands[0];
1163 mul = ir->operands[1]->as_expression();
1164
1165 if (!mul || mul->operation != ir_binop_mul)
1166 return false;
1167 }
1168
1169 nonmul->accept(this);
1170 src_reg src0 = fix_3src_operand(this->result);
1171
1172 mul->operands[0]->accept(this);
1173 src_reg src1 = fix_3src_operand(this->result);
1174
1175 mul->operands[1]->accept(this);
1176 src_reg src2 = fix_3src_operand(this->result);
1177
1178 this->result = src_reg(this, ir->type);
1179 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1180
1181 return true;
1182 }
1183
1184 bool
1185 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1186 {
1187 /* This optimization relies on CMP setting the destination to 0 when
1188 * false. Early hardware only sets the least significant bit, and
1189 * leaves the other bits undefined. So we can't use it.
1190 */
1191 if (brw->gen < 6)
1192 return false;
1193
1194 ir_expression *const cmp = ir->operands[0]->as_expression();
1195
1196 if (cmp == NULL)
1197 return false;
1198
1199 switch (cmp->operation) {
1200 case ir_binop_less:
1201 case ir_binop_greater:
1202 case ir_binop_lequal:
1203 case ir_binop_gequal:
1204 case ir_binop_equal:
1205 case ir_binop_nequal:
1206 break;
1207
1208 default:
1209 return false;
1210 }
1211
1212 cmp->operands[0]->accept(this);
1213 const src_reg cmp_src0 = this->result;
1214
1215 cmp->operands[1]->accept(this);
1216 const src_reg cmp_src1 = this->result;
1217
1218 this->result = src_reg(this, ir->type);
1219
1220 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1221 brw_conditional_for_comparison(cmp->operation)));
1222
1223 /* If the comparison is false, this->result will just happen to be zero.
1224 */
1225 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1226 this->result, src_reg(1.0f));
1227 inst->predicate = BRW_PREDICATE_NORMAL;
1228 inst->predicate_inverse = true;
1229
1230 return true;
1231 }
1232
1233 void
1234 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1235 src_reg src0, src_reg src1)
1236 {
1237 vec4_instruction *inst;
1238
1239 if (brw->gen >= 6) {
1240 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1241 inst->conditional_mod = conditionalmod;
1242 } else {
1243 emit(CMP(dst, src0, src1, conditionalmod));
1244
1245 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1246 inst->predicate = BRW_PREDICATE_NORMAL;
1247 }
1248 }
1249
1250 void
1251 vec4_visitor::emit_lrp(const dst_reg &dst,
1252 const src_reg &x, const src_reg &y, const src_reg &a)
1253 {
1254 if (brw->gen >= 6) {
1255 /* Note that the instruction's argument order is reversed from GLSL
1256 * and the IR.
1257 */
1258 emit(LRP(dst,
1259 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1260 } else {
1261 /* Earlier generations don't support three source operations, so we
1262 * need to emit x*(1-a) + y*a.
1263 */
1264 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1265 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1266 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1267 y_times_a.writemask = dst.writemask;
1268 one_minus_a.writemask = dst.writemask;
1269 x_times_one_minus_a.writemask = dst.writemask;
1270
1271 emit(MUL(y_times_a, y, a));
1272 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1273 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1274 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1275 }
1276 }
1277
1278 void
1279 vec4_visitor::visit(ir_expression *ir)
1280 {
1281 unsigned int operand;
1282 src_reg op[Elements(ir->operands)];
1283 vec4_instruction *inst;
1284
1285 if (ir->operation == ir_binop_add) {
1286 if (try_emit_mad(ir))
1287 return;
1288 }
1289
1290 if (ir->operation == ir_unop_b2f) {
1291 if (try_emit_b2f_of_compare(ir))
1292 return;
1293 }
1294
1295 /* Storage for our result. Ideally for an assignment we'd be using
1296 * the actual storage for the result here, instead.
1297 */
1298 dst_reg result_dst(this, ir->type);
1299 src_reg result_src(result_dst);
1300
1301 if (ir->operation == ir_triop_csel) {
1302 ir->operands[1]->accept(this);
1303 op[1] = this->result;
1304 ir->operands[2]->accept(this);
1305 op[2] = this->result;
1306
1307 enum brw_predicate predicate;
1308 emit_bool_to_cond_code(ir->operands[0], &predicate);
1309 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1310 inst->predicate = predicate;
1311 this->result = result_src;
1312 return;
1313 }
1314
1315 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1316 this->result.file = BAD_FILE;
1317 ir->operands[operand]->accept(this);
1318 if (this->result.file == BAD_FILE) {
1319 fprintf(stderr, "Failed to get tree for expression operand:\n");
1320 ir->operands[operand]->fprint(stderr);
1321 exit(1);
1322 }
1323 op[operand] = this->result;
1324
1325 /* Matrix expression operands should have been broken down to vector
1326 * operations already.
1327 */
1328 assert(!ir->operands[operand]->type->is_matrix());
1329 }
1330
1331 /* If nothing special happens, this is the result. */
1332 this->result = result_src;
1333
1334 switch (ir->operation) {
1335 case ir_unop_logic_not:
1336 emit(NOT(result_dst, op[0]));
1337 break;
1338 case ir_unop_neg:
1339 op[0].negate = !op[0].negate;
1340 emit(MOV(result_dst, op[0]));
1341 break;
1342 case ir_unop_abs:
1343 op[0].abs = true;
1344 op[0].negate = false;
1345 emit(MOV(result_dst, op[0]));
1346 break;
1347
1348 case ir_unop_sign:
1349 if (ir->type->is_float()) {
1350 /* AND(val, 0x80000000) gives the sign bit.
1351 *
1352 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1353 * zero.
1354 */
1355 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1356
1357 op[0].type = BRW_REGISTER_TYPE_UD;
1358 result_dst.type = BRW_REGISTER_TYPE_UD;
1359 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1360
1361 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1362 inst->predicate = BRW_PREDICATE_NORMAL;
1363
1364 this->result.type = BRW_REGISTER_TYPE_F;
1365 } else {
1366 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1367 * -> non-negative val generates 0x00000000.
1368 * Predicated OR sets 1 if val is positive.
1369 */
1370 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1371
1372 emit(ASR(result_dst, op[0], src_reg(31)));
1373
1374 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1375 inst->predicate = BRW_PREDICATE_NORMAL;
1376 }
1377 break;
1378
1379 case ir_unop_rcp:
1380 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1381 break;
1382
1383 case ir_unop_exp2:
1384 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1385 break;
1386 case ir_unop_log2:
1387 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1388 break;
1389 case ir_unop_exp:
1390 case ir_unop_log:
1391 unreachable("not reached: should be handled by ir_explog_to_explog2");
1392 case ir_unop_sin:
1393 case ir_unop_sin_reduced:
1394 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1395 break;
1396 case ir_unop_cos:
1397 case ir_unop_cos_reduced:
1398 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1399 break;
1400
1401 case ir_unop_dFdx:
1402 case ir_unop_dFdx_coarse:
1403 case ir_unop_dFdx_fine:
1404 case ir_unop_dFdy:
1405 case ir_unop_dFdy_coarse:
1406 case ir_unop_dFdy_fine:
1407 unreachable("derivatives not valid in vertex shader");
1408
1409 case ir_unop_bitfield_reverse:
1410 emit(BFREV(result_dst, op[0]));
1411 break;
1412 case ir_unop_bit_count:
1413 emit(CBIT(result_dst, op[0]));
1414 break;
1415 case ir_unop_find_msb: {
1416 src_reg temp = src_reg(this, glsl_type::uint_type);
1417
1418 inst = emit(FBH(dst_reg(temp), op[0]));
1419 inst->dst.writemask = WRITEMASK_XYZW;
1420
1421 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1422 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1423 * subtract the result from 31 to convert the MSB count into an LSB count.
1424 */
1425
1426 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1427 temp.swizzle = BRW_SWIZZLE_NOOP;
1428 emit(MOV(result_dst, temp));
1429
1430 src_reg src_tmp = src_reg(result_dst);
1431 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1432
1433 src_tmp.negate = true;
1434 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1435 inst->predicate = BRW_PREDICATE_NORMAL;
1436 break;
1437 }
1438 case ir_unop_find_lsb:
1439 emit(FBL(result_dst, op[0]));
1440 break;
1441 case ir_unop_saturate:
1442 inst = emit(MOV(result_dst, op[0]));
1443 inst->saturate = true;
1444 break;
1445
1446 case ir_unop_noise:
1447 unreachable("not reached: should be handled by lower_noise");
1448
1449 case ir_binop_add:
1450 emit(ADD(result_dst, op[0], op[1]));
1451 break;
1452 case ir_binop_sub:
1453 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1454
1455 case ir_binop_mul:
1456 if (brw->gen < 8 && ir->type->is_integer()) {
1457 /* For integer multiplication, the MUL uses the low 16 bits of one of
1458 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1459 * accumulates in the contribution of the upper 16 bits of that
1460 * operand. If we can determine that one of the args is in the low
1461 * 16 bits, though, we can just emit a single MUL.
1462 */
1463 if (ir->operands[0]->is_uint16_constant()) {
1464 if (brw->gen < 7)
1465 emit(MUL(result_dst, op[0], op[1]));
1466 else
1467 emit(MUL(result_dst, op[1], op[0]));
1468 } else if (ir->operands[1]->is_uint16_constant()) {
1469 if (brw->gen < 7)
1470 emit(MUL(result_dst, op[1], op[0]));
1471 else
1472 emit(MUL(result_dst, op[0], op[1]));
1473 } else {
1474 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1475
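            /* Neither operand is known to fit in 16 bits, so do the full
             * 32x32 multiply: MUL and MACH together compute the product,
             * and the low 32 bits are read back from the accumulator.
             */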
1476 emit(MUL(acc, op[0], op[1]));
1477 emit(MACH(dst_null_d(), op[0], op[1]));
1478 emit(MOV(result_dst, src_reg(acc)));
1479 }
1480 } else {
1481 emit(MUL(result_dst, op[0], op[1]));
1482 }
1483 break;
1484 case ir_binop_imul_high: {
1485 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1486
1487 emit(MUL(acc, op[0], op[1]));
1488 emit(MACH(result_dst, op[0], op[1]));
1489 break;
1490 }
1491 case ir_binop_div:
1492 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1493 assert(ir->type->is_integer());
1494 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1495 break;
1496 case ir_binop_carry: {
1497 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1498
1499 emit(ADDC(dst_null_ud(), op[0], op[1]));
1500 emit(MOV(result_dst, src_reg(acc)));
1501 break;
1502 }
1503 case ir_binop_borrow: {
1504 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1505
1506 emit(SUBB(dst_null_ud(), op[0], op[1]));
1507 emit(MOV(result_dst, src_reg(acc)));
1508 break;
1509 }
1510 case ir_binop_mod:
1511 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1512 assert(ir->type->is_integer());
1513 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1514 break;
1515
1516 case ir_binop_less:
1517 case ir_binop_greater:
1518 case ir_binop_lequal:
1519 case ir_binop_gequal:
1520 case ir_binop_equal:
1521 case ir_binop_nequal: {
1522 if (brw->gen <= 5) {
1523 resolve_bool_comparison(ir->operands[0], &op[0]);
1524 resolve_bool_comparison(ir->operands[1], &op[1]);
1525 }
1526 emit(CMP(result_dst, op[0], op[1],
1527 brw_conditional_for_comparison(ir->operation)));
1528 break;
1529 }
1530
1531 case ir_binop_all_equal:
1532 /* "==" operator producing a scalar boolean. */
1533 if (ir->operands[0]->type->is_vector() ||
1534 ir->operands[1]->type->is_vector()) {
1535 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1536 emit(MOV(result_dst, src_reg(0)));
1537 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1538 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1539 } else {
1540 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1541 }
1542 break;
1543 case ir_binop_any_nequal:
1544 /* "!=" operator producing a scalar boolean. */
1545 if (ir->operands[0]->type->is_vector() ||
1546 ir->operands[1]->type->is_vector()) {
1547 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1548
1549 emit(MOV(result_dst, src_reg(0)));
1550 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1551 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1552 } else {
1553 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1554 }
1555 break;
1556
1557 case ir_unop_any:
1558 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1559 emit(MOV(result_dst, src_reg(0)));
1560
1561 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1562 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1563 break;
1564
1565 case ir_binop_logic_xor:
1566 emit(XOR(result_dst, op[0], op[1]));
1567 break;
1568
1569 case ir_binop_logic_or:
1570 emit(OR(result_dst, op[0], op[1]));
1571 break;
1572
1573 case ir_binop_logic_and:
1574 emit(AND(result_dst, op[0], op[1]));
1575 break;
1576
1577 case ir_binop_dot:
1578 assert(ir->operands[0]->type->is_vector());
1579 assert(ir->operands[0]->type == ir->operands[1]->type);
1580 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1581 break;
1582
1583 case ir_unop_sqrt:
1584 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1585 break;
1586 case ir_unop_rsq:
1587 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1588 break;
1589
1590 case ir_unop_bitcast_i2f:
1591 case ir_unop_bitcast_u2f:
1592 this->result = op[0];
1593 this->result.type = BRW_REGISTER_TYPE_F;
1594 break;
1595
1596 case ir_unop_bitcast_f2i:
1597 this->result = op[0];
1598 this->result.type = BRW_REGISTER_TYPE_D;
1599 break;
1600
1601 case ir_unop_bitcast_f2u:
1602 this->result = op[0];
1603 this->result.type = BRW_REGISTER_TYPE_UD;
1604 break;
1605
1606 case ir_unop_i2f:
1607 case ir_unop_i2u:
1608 case ir_unop_u2i:
1609 case ir_unop_u2f:
1610 case ir_unop_f2i:
1611 case ir_unop_f2u:
1612 emit(MOV(result_dst, op[0]));
1613 break;
1614 case ir_unop_b2i:
1615 emit(AND(result_dst, op[0], src_reg(1)));
1616 break;
1617 case ir_unop_b2f:
1618 if (brw->gen <= 5) {
1619 resolve_bool_comparison(ir->operands[0], &op[0]);
1620 }
1621 op[0].type = BRW_REGISTER_TYPE_D;
1622 result_dst.type = BRW_REGISTER_TYPE_D;
1623 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1624 result_dst.type = BRW_REGISTER_TYPE_F;
1625 break;
1626 case ir_unop_f2b:
1627 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1628 break;
1629 case ir_unop_i2b:
1630 emit(AND(result_dst, op[0], src_reg(1)));
1631 break;
1632
1633 case ir_unop_trunc:
1634 emit(RNDZ(result_dst, op[0]));
1635 break;
1636 case ir_unop_ceil: {
1637 src_reg tmp = src_reg(this, ir->type);
1638 op[0].negate = !op[0].negate;
1639 emit(RNDD(dst_reg(tmp), op[0]));
1640 tmp.negate = true;
1641 emit(MOV(result_dst, tmp));
1642 }
1643 break;
1644 case ir_unop_floor:
1645 inst = emit(RNDD(result_dst, op[0]));
1646 break;
1647 case ir_unop_fract:
1648 inst = emit(FRC(result_dst, op[0]));
1649 break;
1650 case ir_unop_round_even:
1651 emit(RNDE(result_dst, op[0]));
1652 break;
1653
1654 case ir_binop_min:
1655 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1656 break;
1657 case ir_binop_max:
1658 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1659 break;
1660
1661 case ir_binop_pow:
1662 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1663 break;
1664
1665 case ir_unop_bit_not:
1666 inst = emit(NOT(result_dst, op[0]));
1667 break;
1668 case ir_binop_bit_and:
1669 inst = emit(AND(result_dst, op[0], op[1]));
1670 break;
1671 case ir_binop_bit_xor:
1672 inst = emit(XOR(result_dst, op[0], op[1]));
1673 break;
1674 case ir_binop_bit_or:
1675 inst = emit(OR(result_dst, op[0], op[1]));
1676 break;
1677
1678 case ir_binop_lshift:
1679 inst = emit(SHL(result_dst, op[0], op[1]));
1680 break;
1681
1682 case ir_binop_rshift:
1683 if (ir->type->base_type == GLSL_TYPE_INT)
1684 inst = emit(ASR(result_dst, op[0], op[1]));
1685 else
1686 inst = emit(SHR(result_dst, op[0], op[1]));
1687 break;
1688
1689 case ir_binop_bfm:
1690 emit(BFI1(result_dst, op[0], op[1]));
1691 break;
1692
1693 case ir_binop_ubo_load: {
1694 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1695 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1696 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1697 src_reg offset;
1698
1699 /* Now, load the vector from that offset. */
1700 assert(ir->type->is_vector() || ir->type->is_scalar());
1701
1702 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1703 packed_consts.type = result.type;
1704 src_reg surf_index;
1705
1706 if (const_uniform_block) {
1707 /* The block index is a constant, so just emit the binding table entry
1708 * as an immediate.
1709 */
1710 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1711 const_uniform_block->value.u[0]);
1712 } else {
1713 /* The block index is not a constant. Evaluate the index expression
1714 * per-channel and add the base UBO index; the generator will select
1715 * a value from any live channel.
1716 */
1717 surf_index = src_reg(this, glsl_type::uint_type);
1718 emit(ADD(dst_reg(surf_index), op[0],
1719 src_reg(prog_data->base.binding_table.ubo_start)));
1720
1721 /* Assume this may touch any UBO. It would be nice to provide
1722 * a tighter bound, but the array information is already lowered away.
1723 */
1724 brw_mark_surface_used(&prog_data->base,
1725 prog_data->base.binding_table.ubo_start +
1726 shader_prog->NumUniformBlocks - 1);
1727 }
1728
1729 if (const_offset_ir) {
1730 if (brw->gen >= 8) {
1731 /* Store the offset in a GRF so we can send-from-GRF. */
1732 offset = src_reg(this, glsl_type::int_type);
1733 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1734 } else {
1735 /* Immediates are fine on older generations since they'll be moved
1736 * to a (potentially fake) MRF at the generator level.
1737 */
1738 offset = src_reg(const_offset / 16);
1739 }
1740 } else {
1741 offset = src_reg(this, glsl_type::uint_type);
1742 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1743 }
1744
1745 if (brw->gen >= 7) {
1746 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1747 grf_offset.type = offset.type;
1748
1749 emit(MOV(grf_offset, offset));
1750
1751 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1752 dst_reg(packed_consts),
1753 surf_index,
1754 src_reg(grf_offset)));
1755 } else {
1756 vec4_instruction *pull =
1757 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1758 dst_reg(packed_consts),
1759 surf_index,
1760 offset));
1761 pull->base_mrf = 14;
1762 pull->mlen = 1;
1763 }
1764
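      /* The pull constant load fetches an aligned 16-byte vec4, so shift
       * the swizzle by the dword offset of const_offset within that vec4.
       */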
1765 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1766 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1767 const_offset % 16 / 4,
1768 const_offset % 16 / 4,
1769 const_offset % 16 / 4);
1770
1771 /* UBO bools are any nonzero int. We need to convert them to use the
1772 * value of true stored in ctx->Const.UniformBooleanTrue.
1773 */
1774 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1775 emit(CMP(result_dst, packed_consts, src_reg(0u),
1776 BRW_CONDITIONAL_NZ));
1777 } else {
1778 emit(MOV(result_dst, packed_consts));
1779 }
1780 break;
1781 }
1782
1783 case ir_binop_vector_extract:
1784 unreachable("should have been lowered by vec_index_to_cond_assign");
1785
1786 case ir_triop_fma:
1787 op[0] = fix_3src_operand(op[0]);
1788 op[1] = fix_3src_operand(op[1]);
1789 op[2] = fix_3src_operand(op[2]);
1790 /* Note that the instruction's argument order is reversed from GLSL
1791 * and the IR.
1792 */
1793 emit(MAD(result_dst, op[2], op[1], op[0]));
1794 break;
1795
1796 case ir_triop_lrp:
1797 emit_lrp(result_dst, op[0], op[1], op[2]);
1798 break;
1799
1800 case ir_triop_csel:
1801 unreachable("already handled above");
1802 break;
1803
1804 case ir_triop_bfi:
1805 op[0] = fix_3src_operand(op[0]);
1806 op[1] = fix_3src_operand(op[1]);
1807 op[2] = fix_3src_operand(op[2]);
1808 emit(BFI2(result_dst, op[0], op[1], op[2]));
1809 break;
1810
1811 case ir_triop_bitfield_extract:
1812 op[0] = fix_3src_operand(op[0]);
1813 op[1] = fix_3src_operand(op[1]);
1814 op[2] = fix_3src_operand(op[2]);
1815 /* Note that the instruction's argument order is reversed from GLSL
1816 * and the IR.
1817 */
1818 emit(BFE(result_dst, op[2], op[1], op[0]));
1819 break;
1820
1821 case ir_triop_vector_insert:
1822 unreachable("should have been lowered by lower_vector_insert");
1823
1824 case ir_quadop_bitfield_insert:
1825 unreachable("not reached: should be handled by "
1826 "bitfield_insert_to_bfm_bfi\n");
1827
1828 case ir_quadop_vector:
1829 unreachable("not reached: should be handled by lower_quadop_vector");
1830
1831 case ir_unop_pack_half_2x16:
1832 emit_pack_half_2x16(result_dst, op[0]);
1833 break;
1834 case ir_unop_unpack_half_2x16:
1835 emit_unpack_half_2x16(result_dst, op[0]);
1836 break;
1837 case ir_unop_unpack_unorm_4x8:
1838 emit_unpack_unorm_4x8(result_dst, op[0]);
1839 break;
1840 case ir_unop_unpack_snorm_4x8:
1841 emit_unpack_snorm_4x8(result_dst, op[0]);
1842 break;
1843 case ir_unop_pack_unorm_4x8:
1844 emit_pack_unorm_4x8(result_dst, op[0]);
1845 break;
1846 case ir_unop_pack_snorm_4x8:
1847 emit_pack_snorm_4x8(result_dst, op[0]);
1848 break;
1849 case ir_unop_pack_snorm_2x16:
1850 case ir_unop_pack_unorm_2x16:
1851 case ir_unop_unpack_snorm_2x16:
1852 case ir_unop_unpack_unorm_2x16:
1853 unreachable("not reached: should be handled by lower_packing_builtins");
1854 case ir_unop_unpack_half_2x16_split_x:
1855 case ir_unop_unpack_half_2x16_split_y:
1856 case ir_binop_pack_half_2x16_split:
1857 case ir_unop_interpolate_at_centroid:
1858 case ir_binop_interpolate_at_sample:
1859 case ir_binop_interpolate_at_offset:
1860 unreachable("not reached: should not occur in vertex shader");
1861 case ir_binop_ldexp:
1862 unreachable("not reached: should be handled by ldexp_to_arith()");
1863 }
1864 }
1865
1866
1867 void
1868 vec4_visitor::visit(ir_swizzle *ir)
1869 {
1870 src_reg src;
1871 int i = 0;
1872 int swizzle[4];
1873
1874 /* Note that this is only swizzles in expressions, not those on the left
1875 * hand side of an assignment, which do write masking. See ir_assignment
1876 * for that.
1877 */
1878
1879 ir->val->accept(this);
1880 src = this->result;
1881 assert(src.file != BAD_FILE);
1882
1883 for (i = 0; i < ir->type->vector_elements; i++) {
1884 switch (i) {
1885 case 0:
1886 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1887 break;
1888 case 1:
1889 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1890 break;
1891 case 2:
1892 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1893 break;
1894 case 3:
1895 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1896 break;
1897 }
1898 }
1899 for (; i < 4; i++) {
1900 /* Replicate the last channel out. */
1901 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1902 }
1903
1904 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1905
1906 this->result = src;
1907 }
1908
1909 void
1910 vec4_visitor::visit(ir_dereference_variable *ir)
1911 {
1912 const struct glsl_type *type = ir->type;
1913 dst_reg *reg = variable_storage(ir->var);
1914
1915 if (!reg) {
1916 fail("Failed to find variable storage for %s\n", ir->var->name);
1917 this->result = src_reg(brw_null_reg());
1918 return;
1919 }
1920
1921 this->result = src_reg(*reg);
1922
1923 /* System values get their swizzle from the dst_reg writemask */
1924 if (ir->var->data.mode == ir_var_system_value)
1925 return;
1926
1927 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1928 this->result.swizzle = swizzle_for_size(type->vector_elements);
1929 }
1930
1931
1932 int
1933 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1934 {
1935 /* Under normal circumstances array elements are stored consecutively, so
1936 * the stride is equal to the size of the array element.
1937 */
1938 return type_size(ir->type);
1939 }
1940
1941
1942 void
1943 vec4_visitor::visit(ir_dereference_array *ir)
1944 {
1945 ir_constant *constant_index;
1946 src_reg src;
1947 int array_stride = compute_array_stride(ir);
1948
1949 constant_index = ir->array_index->constant_expression_value();
1950
1951 ir->array->accept(this);
1952 src = this->result;
1953
1954 if (constant_index) {
1955 src.reg_offset += constant_index->value.i[0] * array_stride;
1956 } else {
1957 /* Variable index array dereference. It eats the "vec4" of the
1958 * base of the array and an index that offsets the Mesa register
1959 * index.
1960 */
1961 ir->array_index->accept(this);
1962
1963 src_reg index_reg;
1964
1965 if (array_stride == 1) {
1966 index_reg = this->result;
1967 } else {
1968 index_reg = src_reg(this, glsl_type::int_type);
1969
1970 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1971 }
1972
1973 if (src.reladdr) {
1974 src_reg temp = src_reg(this, glsl_type::int_type);
1975
1976 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1977
1978 index_reg = temp;
1979 }
1980
1981 src.reladdr = ralloc(mem_ctx, src_reg);
1982 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1983 }
1984
1985 /* If the type is smaller than a vec4, replicate the last channel out. */
1986 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1987 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1988 else
1989 src.swizzle = BRW_SWIZZLE_NOOP;
1990 src.type = brw_type_for_base_type(ir->type);
1991
1992 this->result = src;
1993 }
1994
1995 void
1996 vec4_visitor::visit(ir_dereference_record *ir)
1997 {
1998 unsigned int i;
1999 const glsl_type *struct_type = ir->record->type;
2000 int offset = 0;
2001
2002 ir->record->accept(this);
2003
2004 for (i = 0; i < struct_type->length; i++) {
2005 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2006 break;
2007 offset += type_size(struct_type->fields.structure[i].type);
2008 }
2009
2010 /* If the type is smaller than a vec4, replicate the last channel out. */
2011 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2012 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2013 else
2014 this->result.swizzle = BRW_SWIZZLE_NOOP;
2015 this->result.type = brw_type_for_base_type(ir->type);
2016
2017 this->result.reg_offset += offset;
2018 }
2019
2020 /**
2021 * We want to be careful in assignment setup to hit the actual storage
2022 * instead of potentially using a temporary like we might with the
2023 * ir_dereference handler.
2024 */
2025 static dst_reg
2026 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2027 {
2028 /* The LHS must be a dereference. If the LHS is a variable indexed array
2029 * access of a vector, it must be separated into a series of conditional moves
2030 * before reaching this point (see ir_vec_index_to_cond_assign).
2031 */
2032 assert(ir->as_dereference());
2033 ir_dereference_array *deref_array = ir->as_dereference_array();
2034 if (deref_array) {
2035 assert(!deref_array->array->type->is_vector());
2036 }
2037
2038 /* Use the rvalue deref handler for the most part. We'll ignore
2039 * swizzles in it and write swizzles using writemask, though.
2040 */
2041 ir->accept(v);
2042 return dst_reg(v->result);
2043 }
2044
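/**
 * Copy a whole aggregate value (struct, array, or matrix), one vec4 at a
 * time, applying the given predicate to every MOV.  dst and src are
 * advanced as the copy proceeds.
 */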
2045 void
2046 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2047 const struct glsl_type *type,
2048 enum brw_predicate predicate)
2049 {
2050 if (type->base_type == GLSL_TYPE_STRUCT) {
2051 for (unsigned int i = 0; i < type->length; i++) {
2052 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2053 }
2054 return;
2055 }
2056
2057 if (type->is_array()) {
2058 for (unsigned int i = 0; i < type->length; i++) {
2059 emit_block_move(dst, src, type->fields.array, predicate);
2060 }
2061 return;
2062 }
2063
2064 if (type->is_matrix()) {
2065 const struct glsl_type *vec_type;
2066
2067 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2068 type->vector_elements, 1);
2069
2070 for (int i = 0; i < type->matrix_columns; i++) {
2071 emit_block_move(dst, src, vec_type, predicate);
2072 }
2073 return;
2074 }
2075
2076 assert(type->is_scalar() || type->is_vector());
2077
2078 dst->type = brw_type_for_base_type(type);
2079 src->type = dst->type;
2080
2081 dst->writemask = (1 << type->vector_elements) - 1;
2082
2083 src->swizzle = swizzle_for_size(type->vector_elements);
2084
2085 vec4_instruction *inst = emit(MOV(*dst, *src));
2086 inst->predicate = predicate;
2087
2088 dst->reg_offset++;
2089 src->reg_offset++;
2090 }
2091
2092
2093 /* If the RHS processing resulted in an instruction generating a
2094 * temporary value, and it would be easy to rewrite the instruction to
2095 * generate its result right into the LHS instead, do so. This ends
2096 * up reliably removing instructions where it can be tricky to do so
2097 * later without real UD chain information.
2098 */
2099 bool
2100 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2101 dst_reg dst,
2102 src_reg src,
2103 vec4_instruction *pre_rhs_inst,
2104 vec4_instruction *last_rhs_inst)
2105 {
2106 /* This could be supported, but it would take more smarts. */
2107 if (ir->condition)
2108 return false;
2109
2110 if (pre_rhs_inst == last_rhs_inst)
2111 return false; /* No instructions generated to work with. */
2112
2113 /* Make sure the last instruction generated our source reg. */
2114 if (src.file != GRF ||
2115 src.file != last_rhs_inst->dst.file ||
2116 src.reg != last_rhs_inst->dst.reg ||
2117 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2118 src.reladdr ||
2119 src.abs ||
2120 src.negate ||
2121 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2122 return false;
2123
2124 /* Check that the last instruction fully initialized the channels
2125 * we want to use, in the order we want to use them. We could
2126 * potentially reswizzle the operands of many instructions so that
2127 * we could handle out of order channels, but don't yet.
2128 */
2129
2130 for (unsigned i = 0; i < 4; i++) {
2131 if (dst.writemask & (1 << i)) {
2132 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2133 return false;
2134
2135 if (BRW_GET_SWZ(src.swizzle, i) != i)
2136 return false;
2137 }
2138 }
2139
2140 /* Success! Rewrite the instruction. */
2141 last_rhs_inst->dst.file = dst.file;
2142 last_rhs_inst->dst.reg = dst.reg;
2143 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2144 last_rhs_inst->dst.reladdr = dst.reladdr;
2145 last_rhs_inst->dst.writemask &= dst.writemask;
2146
2147 return true;
2148 }
2149
2150 void
2151 vec4_visitor::visit(ir_assignment *ir)
2152 {
2153 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2154 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2155
2156 if (!ir->lhs->type->is_scalar() &&
2157 !ir->lhs->type->is_vector()) {
2158 ir->rhs->accept(this);
2159 src_reg src = this->result;
2160
2161 if (ir->condition) {
2162 emit_bool_to_cond_code(ir->condition, &predicate);
2163 }
2164
2165 /* emit_block_move doesn't account for swizzles in the source register.
2166 * This should be ok, since the source register is a structure or an
2167 * array, and those can't be swizzled. But double-check to be sure.
2168 */
2169 assert(src.swizzle ==
2170 (ir->rhs->type->is_matrix()
2171 ? swizzle_for_size(ir->rhs->type->vector_elements)
2172 : BRW_SWIZZLE_NOOP));
2173
2174 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2175 return;
2176 }
2177
2178 /* Now we're down to just a scalar/vector with writemasks. */
2179 int i;
2180
2181 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2182 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2183
2184 ir->rhs->accept(this);
2185
2186 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2187
2188 src_reg src = this->result;
2189
2190 int swizzles[4];
2191 int first_enabled_chan = 0;
2192 int src_chan = 0;
2193
2194 assert(ir->lhs->type->is_vector() ||
2195 ir->lhs->type->is_scalar());
2196 dst.writemask = ir->write_mask;
2197
2198 for (int i = 0; i < 4; i++) {
2199 if (dst.writemask & (1 << i)) {
2200 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2201 break;
2202 }
2203 }
2204
2205 /* Swizzle a small RHS vector into the channels being written.
2206 *
2207 * GLSL IR treats write_mask as dictating how many channels are
2208 * present on the RHS, while in our instructions we need to make
2209 * those channels appear in the slots of the vec4 they're written to.
2210 */
2211 for (int i = 0; i < 4; i++) {
2212 if (dst.writemask & (1 << i))
2213 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2214 else
2215 swizzles[i] = first_enabled_chan;
2216 }
2217 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2218 swizzles[2], swizzles[3]);
2219
2220 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2221 return;
2222 }
2223
2224 if (ir->condition) {
2225 emit_bool_to_cond_code(ir->condition, &predicate);
2226 }
2227
2228 for (i = 0; i < type_size(ir->lhs->type); i++) {
2229 vec4_instruction *inst = emit(MOV(dst, src));
2230 inst->predicate = predicate;
2231
2232 dst.reg_offset++;
2233 src.reg_offset++;
2234 }
2235 }
2236
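/**
 * Emit the immediate MOVs that write an ir_constant's value into *dst,
 * recursing through structs, arrays, and matrices.  Components with
 * identical values are coalesced into a single writemasked MOV.
 */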
2237 void
2238 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2239 {
2240 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2241 foreach_in_list(ir_constant, field_value, &ir->components) {
2242 emit_constant_values(dst, field_value);
2243 }
2244 return;
2245 }
2246
2247 if (ir->type->is_array()) {
2248 for (unsigned int i = 0; i < ir->type->length; i++) {
2249 emit_constant_values(dst, ir->array_elements[i]);
2250 }
2251 return;
2252 }
2253
2254 if (ir->type->is_matrix()) {
2255 for (int i = 0; i < ir->type->matrix_columns; i++) {
2256 float *vec = &ir->value.f[i * ir->type->vector_elements];
2257
2258 for (int j = 0; j < ir->type->vector_elements; j++) {
2259 dst->writemask = 1 << j;
2260 dst->type = BRW_REGISTER_TYPE_F;
2261
2262 emit(MOV(*dst, src_reg(vec[j])));
2263 }
2264 dst->reg_offset++;
2265 }
2266 return;
2267 }
2268
2269 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2270
2271 for (int i = 0; i < ir->type->vector_elements; i++) {
2272 if (!(remaining_writemask & (1 << i)))
2273 continue;
2274
2275 dst->writemask = 1 << i;
2276 dst->type = brw_type_for_base_type(ir->type);
2277
2278 /* Find other components that match the one we're about to
2279 * write. Emits fewer instructions for things like vec4(0.5,
2280 * 1.5, 1.5, 1.5).
2281 */
2282 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2283 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2284 if (ir->value.b[i] == ir->value.b[j])
2285 dst->writemask |= (1 << j);
2286 } else {
2287 /* u, i, and f storage all line up, so no need for a
2288 * switch case for comparing each type.
2289 */
2290 if (ir->value.u[i] == ir->value.u[j])
2291 dst->writemask |= (1 << j);
2292 }
2293 }
2294
2295 switch (ir->type->base_type) {
2296 case GLSL_TYPE_FLOAT:
2297 emit(MOV(*dst, src_reg(ir->value.f[i])));
2298 break;
2299 case GLSL_TYPE_INT:
2300 emit(MOV(*dst, src_reg(ir->value.i[i])));
2301 break;
2302 case GLSL_TYPE_UINT:
2303 emit(MOV(*dst, src_reg(ir->value.u[i])));
2304 break;
2305 case GLSL_TYPE_BOOL:
2306 emit(MOV(*dst,
2307 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2308 : 0)));
2309 break;
2310 default:
2311 unreachable("Non-float/uint/int/bool constant");
2312 }
2313
2314 remaining_writemask &= ~dst->writemask;
2315 }
2316 dst->reg_offset++;
2317 }
2318
2319 void
2320 vec4_visitor::visit(ir_constant *ir)
2321 {
2322 dst_reg dst = dst_reg(this, ir->type);
2323 this->result = src_reg(dst);
2324
2325 emit_constant_values(&dst, ir);
2326 }
2327
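/**
 * Lower the atomic counter intrinsics (__intrinsic_atomic_read,
 * _increment, and _predecrement) to untyped surface reads and untyped
 * atomics on the counter buffer's binding table entry.
 */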
2328 void
2329 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2330 {
2331 ir_dereference *deref = static_cast<ir_dereference *>(
2332 ir->actual_parameters.get_head());
2333 ir_variable *location = deref->variable_referenced();
2334 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2335 location->data.binding);
2336
2337 /* Calculate the surface offset */
2338 src_reg offset(this, glsl_type::uint_type);
2339 ir_dereference_array *deref_array = deref->as_dereference_array();
2340 if (deref_array) {
2341 deref_array->array_index->accept(this);
2342
2343 src_reg tmp(this, glsl_type::uint_type);
2344 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2345 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2346 } else {
2347 offset = location->data.atomic.offset;
2348 }
2349
2350 /* Emit the appropriate machine instruction */
2351 const char *callee = ir->callee->function_name();
2352 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2353
2354 if (!strcmp("__intrinsic_atomic_read", callee)) {
2355 emit_untyped_surface_read(surf_index, dst, offset);
2356
2357 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2358 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2359 src_reg(), src_reg());
2360
2361 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2362 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2363 src_reg(), src_reg());
2364 }
2365 }
2366
2367 void
2368 vec4_visitor::visit(ir_call *ir)
2369 {
2370 const char *callee = ir->callee->function_name();
2371
2372 if (!strcmp("__intrinsic_atomic_read", callee) ||
2373 !strcmp("__intrinsic_atomic_increment", callee) ||
2374 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2375 visit_atomic_counter_intrinsic(ir);
2376 } else {
2377 unreachable("Unsupported intrinsic.");
2378 }
2379 }
2380
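/**
 * Emit a TXF_MCS message to fetch the multisample control surface (MCS)
 * data for a texel.  The result is passed along as an extra parameter of
 * the subsequent ir_txf_ms fetch on compressed multisample surfaces.
 */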
2381 src_reg
2382 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2383 {
2384 vec4_instruction *inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS);
2385 inst->base_mrf = 2;
2386 inst->mlen = 1;
2387 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2388 inst->dst.writemask = WRITEMASK_XYZW;
2389
2390 inst->src[1] = sampler;
2391
2392 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2393 int param_base = inst->base_mrf;
2394 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2395 int zero_mask = 0xf & ~coord_mask;
2396
2397 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2398 coordinate));
2399
2400 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2401 src_reg(0)));
2402
2403 emit(inst);
2404 return src_reg(inst->dst);
2405 }
2406
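/* Returns whether the sampler index has to be relayed through the message
 * header: either it is not a compile-time constant or it does not fit in
 * the descriptor's 4-bit sampler field (>= 16).  Only Haswell and Gen8+
 * can use such samplers.
 */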
2407 static bool
2408 is_high_sampler(struct brw_context *brw, src_reg sampler)
2409 {
2410 if (brw->gen < 8 && !brw->is_haswell)
2411 return false;
2412
2413 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2414 }
2415
2416 void
2417 vec4_visitor::visit(ir_texture *ir)
2418 {
2419 uint32_t sampler =
2420 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2421
2422 ir_rvalue *nonconst_sampler_index =
2423 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2424
2425 /* Handle non-constant sampler array indexing */
2426 src_reg sampler_reg;
2427 if (nonconst_sampler_index) {
2428 /* The highest sampler which may be used by this operation is
2429 * the last element of the array. Mark it here, because the generator
2430 * doesn't have enough information to determine the bound.
2431 */
2432 uint32_t array_size = ir->sampler->as_dereference_array()
2433 ->array->type->array_size();
2434
2435 uint32_t max_used = sampler + array_size - 1;
2436 if (ir->op == ir_tg4 && brw->gen < 8) {
2437 max_used += prog_data->base.binding_table.gather_texture_start;
2438 } else {
2439 max_used += prog_data->base.binding_table.texture_start;
2440 }
2441
2442 brw_mark_surface_used(&prog_data->base, max_used);
2443
2444 /* Emit code to evaluate the actual indexing expression */
2445 nonconst_sampler_index->accept(this);
2446 dst_reg temp(this, glsl_type::uint_type);
2447 emit(ADD(temp, this->result, src_reg(sampler)))
2448 ->force_writemask_all = true;
2449 sampler_reg = src_reg(temp);
2450 } else {
2451 /* Single sampler, or constant array index; the indexing expression
2452 * is just an immediate.
2453 */
2454 sampler_reg = src_reg(sampler);
2455 }
2456
2457 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2458 * emitting anything other than setting up the constant result.
2459 */
2460 if (ir->op == ir_tg4) {
2461 ir_constant *chan = ir->lod_info.component->as_constant();
2462 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2463 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2464 dst_reg result(this, ir->type);
2465 this->result = src_reg(result);
2466 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2467 return;
2468 }
2469 }
2470
2471 /* Should be lowered by do_lower_texture_projection */
2472 assert(!ir->projector);
2473
2474 /* Should be lowered */
2475 assert(!ir->offset || !ir->offset->type->is_array());
2476
2477 /* Generate code to compute all the subexpression trees. This has to be
2478 * done before loading any values into MRFs for the sampler message since
2479 * generating these values may involve SEND messages that need the MRFs.
2480 */
2481 src_reg coordinate;
2482 if (ir->coordinate) {
2483 ir->coordinate->accept(this);
2484 coordinate = this->result;
2485 }
2486
2487 src_reg shadow_comparitor;
2488 if (ir->shadow_comparitor) {
2489 ir->shadow_comparitor->accept(this);
2490 shadow_comparitor = this->result;
2491 }
2492
2493 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2494 src_reg offset_value;
2495 if (has_nonconstant_offset) {
2496 ir->offset->accept(this);
2497 offset_value = src_reg(this->result);
2498 }
2499
2500 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2501 src_reg lod, dPdx, dPdy, sample_index, mcs;
2502 switch (ir->op) {
2503 case ir_tex:
2504 lod = src_reg(0.0f);
2505 lod_type = glsl_type::float_type;
2506 break;
2507 case ir_txf:
2508 case ir_txl:
2509 case ir_txs:
2510 ir->lod_info.lod->accept(this);
2511 lod = this->result;
2512 lod_type = ir->lod_info.lod->type;
2513 break;
2514 case ir_query_levels:
2515 lod = src_reg(0);
2516 lod_type = glsl_type::int_type;
2517 break;
2518 case ir_txf_ms:
2519 ir->lod_info.sample_index->accept(this);
2520 sample_index = this->result;
2521 sample_index_type = ir->lod_info.sample_index->type;
2522
2523 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2524 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2525 else
2526 mcs = src_reg(0u);
2527 break;
2528 case ir_txd:
2529 ir->lod_info.grad.dPdx->accept(this);
2530 dPdx = this->result;
2531
2532 ir->lod_info.grad.dPdy->accept(this);
2533 dPdy = this->result;
2534
2535 lod_type = ir->lod_info.grad.dPdx->type;
2536 break;
2537 case ir_txb:
2538 case ir_lod:
2539 case ir_tg4:
2540 break;
2541 }
2542
2543 enum opcode opcode;
2544 switch (ir->op) {
2545 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2546 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2547 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2548 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2549 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2550 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2551 case ir_tg4: opcode = has_nonconstant_offset
2552 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2553 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2554 case ir_txb:
2555 unreachable("TXB is not valid for vertex shaders.");
2556 case ir_lod:
2557 unreachable("LOD is not valid for vertex shaders.");
2558 default:
2559 unreachable("Unrecognized tex op");
2560 }
2561
2562 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode);
2563
2564 if (ir->offset != NULL && !has_nonconstant_offset) {
2565 inst->offset =
2566 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2567 ir->offset->type->vector_elements);
2568 }
2569
2570 /* Stuff the channel select bits in the top of the texture offset */
2571 if (ir->op == ir_tg4)
2572 inst->offset |= gather_channel(ir, sampler) << 16;
2573
2574 /* The message header is necessary for:
2575 * - Gen4 (always)
2576 * - Gen9+ for selecting SIMD4x2
2577 * - Texel offsets
2578 * - Gather channel selection
2579 * - Sampler indices too large to fit in a 4-bit value.
2580 */
2581 inst->header_present =
2582 brw->gen < 5 || brw->gen >= 9 ||
2583 inst->offset != 0 || ir->op == ir_tg4 ||
2584 is_high_sampler(brw, sampler_reg);
2585 inst->base_mrf = 2;
2586 inst->mlen = inst->header_present + 1; /* always at least one */
2587 inst->dst = dst_reg(this, ir->type);
2588 inst->dst.writemask = WRITEMASK_XYZW;
2589 inst->shadow_compare = ir->shadow_comparitor != NULL;
2590
2591 inst->src[1] = sampler_reg;
2592
2593 /* MRF for the first parameter */
2594 int param_base = inst->base_mrf + inst->header_present;
2595
2596 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2597 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2598 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2599 } else {
2600 /* Load the coordinate */
2601 /* FINISHME: gl_clamp_mask and saturate */
2602 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2603 int zero_mask = 0xf & ~coord_mask;
2604
2605 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2606 coordinate));
2607
2608 if (zero_mask != 0) {
2609 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2610 src_reg(0)));
2611 }
2612 /* Load the shadow comparitor */
2613 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2614 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2615 WRITEMASK_X),
2616 shadow_comparitor));
2617 inst->mlen++;
2618 }
2619
2620 /* Load the LOD info */
2621 if (ir->op == ir_tex || ir->op == ir_txl) {
2622 int mrf, writemask;
2623 if (brw->gen >= 5) {
2624 mrf = param_base + 1;
2625 if (ir->shadow_comparitor) {
2626 writemask = WRITEMASK_Y;
2627 /* mlen already incremented */
2628 } else {
2629 writemask = WRITEMASK_X;
2630 inst->mlen++;
2631 }
2632 } else /* brw->gen == 4 */ {
2633 mrf = param_base;
2634 writemask = WRITEMASK_W;
2635 }
2636 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2637 } else if (ir->op == ir_txf) {
2638 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2639 } else if (ir->op == ir_txf_ms) {
2640 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2641 sample_index));
2642 if (brw->gen >= 7) {
2643 /* MCS data is in the first channel of `mcs`, but we need to get it into
2644 * the .y channel of the second vec4 of params, so replicate .x across
2645 * the whole vec4 and then mask off everything except .y
2646 */
2647 mcs.swizzle = BRW_SWIZZLE_XXXX;
2648 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2649 mcs));
2650 }
2651 inst->mlen++;
2652 } else if (ir->op == ir_txd) {
2653 const glsl_type *type = lod_type;
2654
2655 if (brw->gen >= 5) {
2656 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2657 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2658 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2659 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2660 inst->mlen++;
2661
2662 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2663 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2664 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2665 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2666 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2667 inst->mlen++;
2668
2669 if (ir->shadow_comparitor) {
2670 emit(MOV(dst_reg(MRF, param_base + 2,
2671 ir->shadow_comparitor->type, WRITEMASK_Z),
2672 shadow_comparitor));
2673 }
2674 }
2675 } else /* brw->gen == 4 */ {
2676 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2677 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2678 inst->mlen += 2;
2679 }
2680 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2681 if (ir->shadow_comparitor) {
2682 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2683 shadow_comparitor));
2684 }
2685
2686 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2687 offset_value));
2688 inst->mlen++;
2689 }
2690 }
2691
2692 emit(inst);
2693
2694 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2695 * faces * layers, but the spec requires just layers.
2696 */
2697 if (ir->op == ir_txs) {
2698 glsl_type const *type = ir->sampler->type;
2699 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2700 type->sampler_array) {
2701 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2702 writemask(inst->dst, WRITEMASK_Z),
2703 src_reg(inst->dst), src_reg(6));
2704 }
2705 }
2706
2707 if (brw->gen == 6 && ir->op == ir_tg4) {
2708 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2709 }
2710
2711 swizzle_result(ir, src_reg(inst->dst), sampler);
2712 }
2713
2714 /**
2715 * Apply workarounds for Gen6 gather with UINT/SINT
2716 */
2717 void
2718 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2719 {
2720 if (!wa)
2721 return;
2722
2723 int width = (wa & WA_8BIT) ? 8 : 16;
2724 dst_reg dst_f = dst;
2725 dst_f.type = BRW_REGISTER_TYPE_F;
2726
2727 /* Convert from UNORM to UINT */
2728 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2729 emit(MOV(dst, src_reg(dst_f)));
2730
2731 if (wa & WA_SIGN) {
2732 /* Reinterpret the UINT value as a signed INT value by
2733 * shifting the sign bit into place, then shifting back
2734 * preserving sign.
2735 */
2736 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2737 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2738 }
2739 }
2740
2741 /**
2742 * Set up the gather channel based on the swizzle, for gather4.
2743 */
2744 uint32_t
2745 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2746 {
2747 ir_constant *chan = ir->lod_info.component->as_constant();
2748 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2749 switch (swiz) {
2750 case SWIZZLE_X: return 0;
2751 case SWIZZLE_Y:
2752 /* gather4 sampler is broken for green channel on RG32F --
2753 * we must ask for blue instead.
2754 */
2755 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2756 return 2;
2757 return 1;
2758 case SWIZZLE_Z: return 2;
2759 case SWIZZLE_W: return 3;
2760 default:
2761 unreachable("Not reached"); /* zero, one swizzles handled already */
2762 }
2763 }
2764
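/**
 * Apply the GL texture swizzle (key->tex.swizzles) to the raw sampler
 * result: SWIZZLE_ZERO/SWIZZLE_ONE channels are written with immediate
 * moves, and the remaining channels with a single swizzled MOV.
 */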
2765 void
2766 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2767 {
2768 int s = key->tex.swizzles[sampler];
2769
2770 this->result = src_reg(this, ir->type);
2771 dst_reg swizzled_result(this->result);
2772
2773 if (ir->op == ir_query_levels) {
2774 /* # levels is in .w */
2775 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2776 emit(MOV(swizzled_result, orig_val));
2777 return;
2778 }
2779
2780 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2781 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2782 emit(MOV(swizzled_result, orig_val));
2783 return;
2784 }
2785
2786
2787 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2788 int swizzle[4] = {0};
2789
2790 for (int i = 0; i < 4; i++) {
2791 switch (GET_SWZ(s, i)) {
2792 case SWIZZLE_ZERO:
2793 zero_mask |= (1 << i);
2794 break;
2795 case SWIZZLE_ONE:
2796 one_mask |= (1 << i);
2797 break;
2798 default:
2799 copy_mask |= (1 << i);
2800 swizzle[i] = GET_SWZ(s, i);
2801 break;
2802 }
2803 }
2804
2805 if (copy_mask) {
2806 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2807 swizzled_result.writemask = copy_mask;
2808 emit(MOV(swizzled_result, orig_val));
2809 }
2810
2811 if (zero_mask) {
2812 swizzled_result.writemask = zero_mask;
2813 emit(MOV(swizzled_result, src_reg(0.0f)));
2814 }
2815
2816 if (one_mask) {
2817 swizzled_result.writemask = one_mask;
2818 emit(MOV(swizzled_result, src_reg(1.0f)));
2819 }
2820 }
2821
2822 void
2823 vec4_visitor::visit(ir_return *)
2824 {
2825 unreachable("not reached");
2826 }
2827
2828 void
2829 vec4_visitor::visit(ir_discard *)
2830 {
2831 unreachable("not reached");
2832 }
2833
2834 void
2835 vec4_visitor::visit(ir_if *ir)
2836 {
2837 /* Don't point the annotation at the if statement, because then it plus
2838 * the then and else blocks get printed.
2839 */
2840 this->base_ir = ir->condition;
2841
2842 if (brw->gen == 6) {
2843 emit_if_gen6(ir);
2844 } else {
2845 enum brw_predicate predicate;
2846 emit_bool_to_cond_code(ir->condition, &predicate);
2847 emit(IF(predicate));
2848 }
2849
2850 visit_instructions(&ir->then_instructions);
2851
2852 if (!ir->else_instructions.is_empty()) {
2853 this->base_ir = ir->condition;
2854 emit(BRW_OPCODE_ELSE);
2855
2856 visit_instructions(&ir->else_instructions);
2857 }
2858
2859 this->base_ir = ir->condition;
2860 emit(BRW_OPCODE_ENDIF);
2861 }
2862
2863 void
2864 vec4_visitor::visit(ir_emit_vertex *)
2865 {
2866 unreachable("not reached");
2867 }
2868
2869 void
2870 vec4_visitor::visit(ir_end_primitive *)
2871 {
2872 unreachable("not reached");
2873 }
2874
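/**
 * Emit an untyped atomic operation (e.g. BRW_AOP_INC) on the surface at
 * surf_index.  The offset and up to two operands are staged in
 * consecutive MRFs starting at MRF 0.
 */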
2875 void
2876 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2877 dst_reg dst, src_reg offset,
2878 src_reg src0, src_reg src1)
2879 {
2880 unsigned mlen = 0;
2881
2882 /* Set the atomic operation offset. */
2883 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2884 mlen++;
2885
2886 /* Set the atomic operation arguments. */
2887 if (src0.file != BAD_FILE) {
2888 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2889 mlen++;
2890 }
2891
2892 if (src1.file != BAD_FILE) {
2893 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2894 mlen++;
2895 }
2896
2897 /* Emit the instruction. Note that this maps to the normal SIMD8
2898 * untyped atomic message on Ivy Bridge, but that's OK because
2899 * unused channels will be masked out.
2900 */
2901 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2902 src_reg(atomic_op), src_reg(surf_index));
2903 inst->base_mrf = 0;
2904 inst->mlen = mlen;
2905 }
2906
2907 void
2908 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2909 src_reg offset)
2910 {
2911 /* Set the surface read offset. */
2912 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2913
2914 /* Emit the instruction. Note that this maps to the normal SIMD8
2915 * untyped surface read message, but that's OK because unused
2916 * channels will be masked out.
2917 */
2918 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2919 dst, src_reg(surf_index));
2920 inst->base_mrf = 0;
2921 inst->mlen = 1;
2922 }
2923
2924 void
2925 vec4_visitor::emit_ndc_computation()
2926 {
2927 /* Get the position */
2928 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2929
2930 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2931 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2932 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2933
2934 current_annotation = "NDC";
2935 dst_reg ndc_w = ndc;
2936 ndc_w.writemask = WRITEMASK_W;
2937 src_reg pos_w = pos;
2938 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2939 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2940
2941 dst_reg ndc_xyz = ndc;
2942 ndc_xyz.writemask = WRITEMASK_XYZ;
2943
2944 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2945 }
2946
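/**
 * Write the point size, clip flag, and (on Gen6+) layer/viewport fields
 * of the VUE header, depending on the hardware generation and on which
 * varyings the shader actually writes.
 */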
2947 void
2948 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2949 {
2950 if (brw->gen < 6 &&
2951 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2952 key->userclip_active || brw->has_negative_rhw_bug)) {
2953 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2954 dst_reg header1_w = header1;
2955 header1_w.writemask = WRITEMASK_W;
2956
2957 emit(MOV(header1, 0u));
2958
2959 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2960 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2961
2962 current_annotation = "Point size";
2963 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2964 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2965 }
2966
2967 if (key->userclip_active) {
2968 current_annotation = "Clipping flags";
2969 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2970 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2971
2972 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2973 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2974 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2975
2976 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2977 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2978 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2979 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2980 }
2981
2982 /* i965 clipping workaround:
2983 * 1) Test for -ve rhw
2984 * 2) If set,
2985 * set ndc = (0,0,0,0)
2986 * set ucp[6] = 1
2987 *
2988 * Later, clipping will detect ucp[6] and ensure the primitive is
2989 * clipped against all fixed planes.
2990 */
2991 if (brw->has_negative_rhw_bug) {
2992 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2993 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2994 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2995 vec4_instruction *inst;
2996 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2997 inst->predicate = BRW_PREDICATE_NORMAL;
2998 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2999 inst->predicate = BRW_PREDICATE_NORMAL;
3000 }
3001
3002 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3003 } else if (brw->gen < 6) {
3004 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3005 } else {
3006 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3007 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3008 dst_reg reg_w = reg;
3009 reg_w.writemask = WRITEMASK_W;
3010 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3011 }
3012 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3013 dst_reg reg_y = reg;
3014 reg_y.writemask = WRITEMASK_Y;
3015 reg_y.type = BRW_REGISTER_TYPE_D;
3016 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3017 }
3018 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3019 dst_reg reg_z = reg;
3020 reg_z.writemask = WRITEMASK_Z;
3021 reg_z.type = BRW_REGISTER_TYPE_D;
3022 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3023 }
3024 }
3025 }
3026
3027 void
3028 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3029 {
3030 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3031 *
3032 * "If a linked set of shaders forming the vertex stage contains no
3033 * static write to gl_ClipVertex or gl_ClipDistance, but the
3034 * application has requested clipping against user clip planes through
3035 * the API, then the coordinate written to gl_Position is used for
3036 * comparison against the user clip planes."
3037 *
3038 * This function is only called if the shader didn't write to
3039 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3040 * if the user wrote to it; otherwise we use gl_Position.
3041 */
3042 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3043 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3044 clip_vertex = VARYING_SLOT_POS;
3045 }
3046
3047 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3048 ++i) {
3049 reg.writemask = 1 << i;
3050 emit(DP4(reg,
3051 src_reg(output_reg[clip_vertex]),
3052 src_reg(this->userplane[i + offset])));
3053 }
3054 }
3055
3056 vec4_instruction *
3057 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3058 {
3059 assert (varying < VARYING_SLOT_MAX);
3060 reg.type = output_reg[varying].type;
3061 current_annotation = output_reg_annotation[varying];
3062 /* Copy the register, saturating if necessary */
3063 return emit(MOV(reg, src_reg(output_reg[varying])));
3064 }
3065
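/**
 * Stage one VUE slot's worth of data into the given MRF, handling the
 * built-in slots (PSIZ, NDC, POS, EDGE, PAD, COL/BFC) specially and
 * falling back to a plain copy for generic varyings.
 */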
3066 void
3067 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3068 {
3069 reg.type = BRW_REGISTER_TYPE_F;
3070
3071 switch (varying) {
3072 case VARYING_SLOT_PSIZ:
3073 {
3074 /* PSIZ is always in slot 0, and is coupled with other flags. */
3075 current_annotation = "indices, point width, clip flags";
3076 emit_psiz_and_flags(reg);
3077 break;
3078 }
3079 case BRW_VARYING_SLOT_NDC:
3080 current_annotation = "NDC";
3081 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3082 break;
3083 case VARYING_SLOT_POS:
3084 current_annotation = "gl_Position";
3085 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3086 break;
3087 case VARYING_SLOT_EDGE:
3088 /* This is present when doing unfilled polygons. We're supposed to copy
3089 * the edge flag from the user-provided vertex array
3090 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3091 * of that attribute (starts as 1.0f). This is then used in clipping to
3092 * determine which edges should be drawn as wireframe.
3093 */
3094 current_annotation = "edge flag";
3095 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3096 glsl_type::float_type, WRITEMASK_XYZW))));
3097 break;
3098 case BRW_VARYING_SLOT_PAD:
3099 /* No need to write to this slot */
3100 break;
3101 case VARYING_SLOT_COL0:
3102 case VARYING_SLOT_COL1:
3103 case VARYING_SLOT_BFC0:
3104 case VARYING_SLOT_BFC1: {
3105 /* These built-in varyings are only supported in compatibility mode,
3106 * and we only support GS in core profile. So, this must be a vertex
3107 * shader.
3108 */
3109 assert(stage == MESA_SHADER_VERTEX);
3110 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3111 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3112 inst->saturate = true;
3113 break;
3114 }
3115
3116 default:
3117 emit_generic_urb_slot(reg, varying);
3118 break;
3119 }
3120 }
3121
3122 static int
3123 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3124 {
3125 if (brw->gen >= 6) {
3126 /* URB data written (does not include the message header reg) must
3127 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3128 * section 5.4.3.2.2: URB_INTERLEAVED.
3129 *
3130 * URB entries are allocated on a multiple of 1024 bits, so an
3131 * extra 128 bits written here to make the end align to 256 is
3132 * no problem.
3133 */
3134 if ((mlen % 2) != 1)
3135 mlen++;
3136 }
3137
3138 return mlen;
3139 }
3140
3141
3142 /**
3143 * Generates the VUE payload plus the necessary URB write instructions to
3144 * output it.
3145 *
3146 * The VUE layout is documented in Volume 2a.
3147 */
3148 void
3149 vec4_visitor::emit_vertex()
3150 {
3151 /* MRF 0 is reserved for the debugger, so start with message header
3152 * in MRF 1.
3153 */
3154 int base_mrf = 1;
3155 int mrf = base_mrf;
3156 /* In the process of generating our URB write message contents, we
3157 * may need to unspill a register or load from an array. Those
3158 * reads would use MRFs 14-15.
3159 */
3160 int max_usable_mrf = 13;
3161
3162 /* The following assertion verifies that max_usable_mrf causes an
3163 * even-numbered amount of URB write data, which will meet gen6's
3164 * requirements for length alignment.
3165 */
3166 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3167
3168 /* First mrf is the g0-based message header containing URB handles and
3169 * such.
3170 */
3171 emit_urb_write_header(mrf++);
3172
3173 if (brw->gen < 6) {
3174 emit_ndc_computation();
3175 }
3176
3177 /* Lower legacy fixed-function and ClipVertex clipping to clip distances */
3178 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3179 current_annotation = "user clip distances";
3180
3181 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3182 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3183
3184 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3185 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3186 }
3187
3188 /* We may need to split this up into several URB writes, so do them in a
3189 * loop.
3190 */
3191 int slot = 0;
3192 bool complete = false;
3193 do {
3194 /* URB offset is in URB row increments, and each of our MRFs is half of
3195 * one of those, since we're doing interleaved writes.
3196 */
3197 int offset = slot / 2;
3198
3199 mrf = base_mrf + 1;
3200 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3201 emit_urb_slot(dst_reg(MRF, mrf++),
3202 prog_data->vue_map.slot_to_varying[slot]);
3203
3204 /* If this was max_usable_mrf, we can't fit anything more into this
3205 * URB WRITE.
3206 */
3207 if (mrf > max_usable_mrf) {
3208 slot++;
3209 break;
3210 }
3211 }
3212
3213 complete = slot >= prog_data->vue_map.num_slots;
3214 current_annotation = "URB write";
3215 vec4_instruction *inst = emit_urb_write_opcode(complete);
3216 inst->base_mrf = base_mrf;
3217 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3218 inst->offset += offset;
3219 } while (!complete);
3220 }
3221
3222
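/**
 * Compute the offset operand for a scratch read/write of the vec4 at
 * reg_offset: an immediate for direct access, or a freshly computed
 * register when reladdr is set, scaled for the interleaved (and, pre-Gen6,
 * byte-addressed) layout.
 */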
3223 src_reg
3224 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3225 src_reg *reladdr, int reg_offset)
3226 {
3227 /* Because we store the values to scratch interleaved like our
3228 * vertex data, we need to scale the vec4 index by 2.
3229 */
3230 int message_header_scale = 2;
3231
3232 /* Pre-gen6, the message header uses byte offsets instead of vec4
3233 * (16-byte) offset units.
3234 */
3235 if (brw->gen < 6)
3236 message_header_scale *= 16;
3237
3238 if (reladdr) {
3239 src_reg index = src_reg(this, glsl_type::int_type);
3240
3241 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3242 src_reg(reg_offset)));
3243 emit_before(block, inst, MUL(dst_reg(index), index,
3244 src_reg(message_header_scale)));
3245
3246 return index;
3247 } else {
3248 return src_reg(reg_offset * message_header_scale);
3249 }
3250 }
3251
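/**
 * Compute the offset operand for a pull constant load of the vec4 at
 * reg_offset, handling relative addressing, pre-Gen6 byte offsets, and
 * the Gen8+ send-from-GRF case where the offset must live in a GRF.
 */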
3252 src_reg
3253 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3254 src_reg *reladdr, int reg_offset)
3255 {
3256 if (reladdr) {
3257 src_reg index = src_reg(this, glsl_type::int_type);
3258
3259 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3260 src_reg(reg_offset)));
3261
3262 /* Pre-gen6, the message header uses byte offsets instead of vec4
3263 * (16-byte) offset units.
3264 */
3265 if (brw->gen < 6) {
3266 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3267 }
3268
3269 return index;
3270 } else if (brw->gen >= 8) {
3271 /* Store the offset in a GRF so we can send-from-GRF. */
3272 src_reg offset = src_reg(this, glsl_type::int_type);
3273 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3274 return offset;
3275 } else {
3276 int message_header_scale = brw->gen < 6 ? 16 : 1;
3277 return src_reg(reg_offset * message_header_scale);
3278 }
3279 }
3280
3281 /**
3282 * Emits an instruction before @inst to load the value named by @orig_src
3283 * from scratch space at @base_offset to @temp.
3284 *
3285 * @base_offset is measured in 32-byte units (the size of a register).
3286 */
3287 void
3288 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3289 dst_reg temp, src_reg orig_src,
3290 int base_offset)
3291 {
3292 int reg_offset = base_offset + orig_src.reg_offset;
3293 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3294 reg_offset);
3295
3296 emit_before(block, inst, SCRATCH_READ(temp, index));
3297 }
3298
3299 /**
3300 * Emits an instruction after @inst to store the value to be written
3301 * to @orig_dst to scratch space at @base_offset, from @temp.
3302 *
3303 * @base_offset is measured in 32-byte units (the size of a register).
3304 */
3305 void
3306 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3307 int base_offset)
3308 {
3309 int reg_offset = base_offset + inst->dst.reg_offset;
3310 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3311 reg_offset);
3312
3313 /* Create a temporary register to store *inst's result in.
3314 *
3315 * We have to be careful in MOVing from our temporary result register in
3316 * the scratch write. If we swizzle from channels of the temporary that
3317 * weren't initialized, it will confuse live interval analysis, which will
3318 * make spilling fail to make progress.
3319 */
3320 src_reg temp = src_reg(this, glsl_type::vec4_type);
3321 temp.type = inst->dst.type;
3322 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3323 int swizzles[4];
3324 for (int i = 0; i < 4; i++)
3325 if (inst->dst.writemask & (1 << i))
3326 swizzles[i] = i;
3327 else
3328 swizzles[i] = first_writemask_chan;
3329 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3330 swizzles[2], swizzles[3]);
3331
3332 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3333 inst->dst.writemask));
3334 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3335 write->predicate = inst->predicate;
3336 write->ir = inst->ir;
3337 write->annotation = inst->annotation;
3338 inst->insert_after(block, write);
3339
3340 inst->dst.file = temp.file;
3341 inst->dst.reg = temp.reg;
3342 inst->dst.reg_offset = temp.reg_offset;
3343 inst->dst.reladdr = NULL;
3344 }
3345
3346 /**
3347 * We can't generally support array access in GRF space, because a
3348 * single instruction's destination can only span 2 contiguous
3349 * registers. So, we send all GRF arrays that get variable index
3350 * access to scratch space.
3351 */
3352 void
3353 vec4_visitor::move_grf_array_access_to_scratch()
3354 {
3355 int scratch_loc[this->alloc.count];
3356 memset(scratch_loc, -1, sizeof(scratch_loc));
3357
3358 /* First, calculate the set of virtual GRFs that need to be punted
3359 * to scratch due to having any array access on them, and where in
3360 * scratch.
3361 */
3362 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3363 if (inst->dst.file == GRF && inst->dst.reladdr &&
3364 scratch_loc[inst->dst.reg] == -1) {
3365 scratch_loc[inst->dst.reg] = c->last_scratch;
3366 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3367 }
3368
3369 for (int i = 0 ; i < 3; i++) {
3370 src_reg *src = &inst->src[i];
3371
3372 if (src->file == GRF && src->reladdr &&
3373 scratch_loc[src->reg] == -1) {
3374 scratch_loc[src->reg] = c->last_scratch;
3375 c->last_scratch += this->alloc.sizes[src->reg];
3376 }
3377 }
3378 }
3379
3380 /* Now, for anything that will be accessed through scratch, rewrite
3381 * it to load/store. Note that this is a _safe list walk, because
3382 * we may generate a new scratch_write instruction after the one
3383 * we're processing.
3384 */
3385 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3386 /* Set up the annotation tracking for new generated instructions. */
3387 base_ir = inst->ir;
3388 current_annotation = inst->annotation;
3389
3390 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3391 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3392 }
3393
3394 for (int i = 0 ; i < 3; i++) {
3395 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3396 continue;
3397
3398 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3399
3400 emit_scratch_read(block, inst, temp, inst->src[i],
3401 scratch_loc[inst->src[i].reg]);
3402
3403 inst->src[i].file = temp.file;
3404 inst->src[i].reg = temp.reg;
3405 inst->src[i].reg_offset = temp.reg_offset;
3406 inst->src[i].reladdr = NULL;
3407 }
3408 }
3409 }
3410
3411 /**
3412 * Emits an instruction before @inst to load the value named by @orig_src
3413 * from the pull constant buffer (surface) at @base_offset to @temp.
3414 */
3415 void
3416 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3417 dst_reg temp, src_reg orig_src,
3418 int base_offset)
3419 {
3420 int reg_offset = base_offset + orig_src.reg_offset;
3421 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3422 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3423 reg_offset);
3424 vec4_instruction *load;
3425
3426 if (brw->gen >= 7) {
3427 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3428 grf_offset.type = offset.type;
3429 emit_before(block, inst, MOV(grf_offset, offset));
3430
3431 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3432 temp, index, src_reg(grf_offset));
3433 } else {
3434 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
3435 temp, index, offset);
3436 load->base_mrf = 14;
3437 load->mlen = 1;
3438 }
3439 emit_before(block, inst, load);
3440 }
3441
3442 /**
3443 * Implements array access of uniforms by inserting a
3444 * PULL_CONSTANT_LOAD instruction.
3445 *
3446 * Unlike temporary GRF array access (where we don't support it due to
3447 * the difficulty of doing relative addressing on instruction
3448 * destinations), we could potentially do array access of uniforms
3449 * that were loaded in GRF space as push constants. In real-world
3450 * usage we've seen, though, the arrays being used are always larger
3451 * than we could load as push constants, so just always move all
3452 * uniform array access out to a pull constant buffer.
3453 */
3454 void
3455 vec4_visitor::move_uniform_array_access_to_pull_constants()
3456 {
3457 int pull_constant_loc[this->uniforms];
3458 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3459 bool nested_reladdr;
3460
3461 /* Walk through and find array access of uniforms. Put a copy of that
3462 * uniform in the pull constant buffer.
3463 *
3464 * Note that we don't move constant-indexed accesses to arrays. No
3465 * testing has been done of the performance impact of this choice.
3466 */
3467 do {
3468 nested_reladdr = false;
3469
3470 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3471 for (int i = 0 ; i < 3; i++) {
3472 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3473 continue;
3474
3475 int uniform = inst->src[i].reg;
3476
3477 if (inst->src[i].reladdr->reladdr)
3478 nested_reladdr = true; /* will need another pass */
3479
3480 /* If this array isn't already present in the pull constant buffer,
3481 * add it.
3482 */
3483 if (pull_constant_loc[uniform] == -1) {
3484 const gl_constant_value **values =
3485 &stage_prog_data->param[uniform * 4];
3486
3487 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3488
3489 assert(uniform < uniform_array_size);
3490 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3491 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3492 = values[j];
3493 }
3494 }
3495
3496 /* Set up the annotation tracking for new generated instructions. */
3497 base_ir = inst->ir;
3498 current_annotation = inst->annotation;
3499
3500 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3501
3502 emit_pull_constant_load(block, inst, temp, inst->src[i],
3503 pull_constant_loc[uniform]);
3504
3505 inst->src[i].file = temp.file;
3506 inst->src[i].reg = temp.reg;
3507 inst->src[i].reg_offset = temp.reg_offset;
3508 inst->src[i].reladdr = NULL;
3509 }
3510 }
3511 } while (nested_reladdr);
3512
3513 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3514 * no need to track them as larger-than-vec4 objects. This will be
3515 * relied on in cutting out unused uniform vectors from push
3516 * constants.
3517 */
3518 split_uniform_registers();
3519 }
3520
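/* If the source is UD-typed and has the negate modifier set, materialize
 * the negated value into a temporary so that later instructions see a
 * plain source with no modifier.
 */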
3521 void
3522 vec4_visitor::resolve_ud_negate(src_reg *reg)
3523 {
3524 if (reg->type != BRW_REGISTER_TYPE_UD ||
3525 !reg->negate)
3526 return;
3527
3528 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3529 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3530 *reg = temp;
3531 }
3532
3533 /**
3534 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3535 *
3536 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3537 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3538 */
3539 void
3540 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3541 {
3542 assert(brw->gen <= 5);
3543
3544 if (!rvalue->type->is_boolean())
3545 return;
3546
3547 src_reg and_result = src_reg(this, rvalue->type);
3548 src_reg neg_result = src_reg(this, rvalue->type);
3549 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3550 emit(MOV(dst_reg(neg_result), negate(and_result)));
3551 *reg = neg_result;
3552 }
3553
3554 vec4_visitor::vec4_visitor(struct brw_context *brw,
3555 struct brw_vec4_compile *c,
3556 struct gl_program *prog,
3557 const struct brw_vue_prog_key *key,
3558 struct brw_vue_prog_data *prog_data,
3559 struct gl_shader_program *shader_prog,
3560 gl_shader_stage stage,
3561 void *mem_ctx,
3562 bool debug_flag,
3563 bool no_spills,
3564 shader_time_shader_type st_base,
3565 shader_time_shader_type st_written,
3566 shader_time_shader_type st_reset)
3567 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3568 c(c),
3569 key(key),
3570 prog_data(prog_data),
3571 sanity_param_count(0),
3572 fail_msg(NULL),
3573 first_non_payload_grf(0),
3574 need_all_constants_in_pull_buffer(false),
3575 debug_flag(debug_flag),
3576 no_spills(no_spills),
3577 st_base(st_base),
3578 st_written(st_written),
3579 st_reset(st_reset)
3580 {
3581 this->mem_ctx = mem_ctx;
3582 this->failed = false;
3583
3584 this->base_ir = NULL;
3585 this->current_annotation = NULL;
3586 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3587
3588 this->variable_ht = hash_table_ctor(0,
3589 hash_table_pointer_hash,
3590 hash_table_pointer_compare);
3591
3592 this->virtual_grf_start = NULL;
3593 this->virtual_grf_end = NULL;
3594 this->live_intervals = NULL;
3595
3596 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3597
3598 this->uniforms = 0;
3599
3600 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3601 * at least one. See setup_uniforms() in brw_vec4.cpp.
3602 */
3603 this->uniform_array_size = 1;
3604 if (prog_data) {
3605 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3606 }
3607
3608 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3609 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3610 }
3611
3612 vec4_visitor::~vec4_visitor()
3613 {
3614 hash_table_dtor(this->variable_ht);
3615 }
3616
3617
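/**
 * Mark the compile as failed and record the (first) failure message in
 * fail_msg; the message is also printed to stderr when the stage's debug
 * flag is set.
 */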
3618 void
3619 vec4_visitor::fail(const char *format, ...)
3620 {
3621 va_list va;
3622 char *msg;
3623
3624 if (failed)
3625 return;
3626
3627 failed = true;
3628
3629 va_start(va, format);
3630 msg = ralloc_vasprintf(mem_ctx, format, va);
3631 va_end(va);
3632 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3633
3634 this->fail_msg = msg;
3635
3636 if (debug_flag) {
3637 fprintf(stderr, "%s", msg);
3638 }
3639 }
3640
3641 } /* namespace brw */