i965/vec4: Add a helper function to emit VS_OPCODE_PULL_CONSTANT_LOAD
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
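/**
 * Append an already-constructed instruction to the end of the instruction
 * list, tagging it with the IR node and annotation currently being visited
 * so that debug dumps stay associated with the right source construct.
 */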
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
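/**
 * Insert new_inst immediately before inst in the given basic block,
 * inheriting inst's IR pointer and annotation for debug output.
 */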
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
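/* The ALU1/ALU2/ALU2_ACC/ALU3 macros below stamp out one builder method per
 * hardware opcode.  Note that they only construct the vec4_instruction; the
 * caller is still responsible for passing the result to emit().
 */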
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(brw->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
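/* Builders for the gen4-style scratch read/write messages (used for register
 * spills and indirectly addressed temporaries).  base_mrf and mlen describe
 * the MRF payload that the generator will assemble for the send.
 */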
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
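/* Emit a dot product of the first 'elements' components (DP2, DP3 or DP4). */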
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (brw->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
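/**
 * Emit a math instruction, working around per-generation restrictions:
 * operands are expanded as needed (see fix_math_operand), gen6 MATH can't
 * honor a partial writemask so it goes through a temporary, and gen4/5 math
 * is a send that needs an MRF payload.
 */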
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (brw->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (brw->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416     * The upper word of each write-channel must be 0 for the following
417     * bit-shift and bit-or instructions to work. Note that this relies on
418     * the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
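   /* The vector-float immediate bytes 0x00, 0x60, 0x70 and 0x78 encode
    * <0.0, 8.0, 16.0, 24.0>; the type-converting MOV below turns them into
    * the integer shift counts <0, 8, 16, 24> mentioned above.
    */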
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
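/* Returns the size of a glsl type in units of vec4 slots, which is the
 * granularity the vec4 backend uses for register and uniform allocation.
 */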
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590          /* Regardless of the size of the vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = brw_swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (strncmp(ir->name, storage->name, namelen) != 0 ||
690 (storage->name[namelen] != 0 &&
691 storage->name[namelen] != '.' &&
692 storage->name[namelen] != '[')) {
693 continue;
694 }
695
696 gl_constant_value *components = storage->storage;
697 unsigned vector_count = (MAX2(storage->array_elements, 1) *
698 storage->type->matrix_columns);
699
700 for (unsigned s = 0; s < vector_count; s++) {
701 assert(uniforms < uniform_array_size);
702 uniform_vector_size[uniforms] = storage->type->vector_elements;
703
704 int i;
705 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
706 stage_prog_data->param[uniforms * 4 + i] = components;
707 components++;
708 }
709 for (; i < 4; i++) {
710 static gl_constant_value zero = { 0.0 };
711 stage_prog_data->param[uniforms * 4 + i] = &zero;
712 }
713
714 uniforms++;
715 }
716 }
717 }
718
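/* Upload the active user clip planes as push constants, one vec4 uniform per
 * enabled plane, and remember them in userplane[] for use when the clip
 * distances are emitted.
 */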
719 void
720 vec4_visitor::setup_uniform_clipplane_values()
721 {
722 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
723
724 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 4;
727 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
728 this->userplane[i].type = BRW_REGISTER_TYPE_F;
729 for (int j = 0; j < 4; ++j) {
730 stage_prog_data->param[this->uniforms * 4 + j] =
731 (gl_constant_value *) &clip_planes[i][j];
732 }
733 ++this->uniforms;
734 }
735 }
736
737 /* Our support for builtin uniforms is even scarier than non-builtin.
738 * It sits on top of the PROG_STATE_VAR parameters that are
739 * automatically updated from GL context state.
740 */
741 void
742 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
743 {
744 const ir_state_slot *const slots = ir->get_state_slots();
745 assert(slots != NULL);
746
747 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
748 /* This state reference has already been setup by ir_to_mesa,
749 * but we'll get the same index back here. We can reference
750 * ParameterValues directly, since unlike brw_fs.cpp, we never
751 * add new state references during compile.
752 */
753 int index = _mesa_add_state_reference(this->prog->Parameters,
754 (gl_state_index *)slots[i].tokens);
755 gl_constant_value *values =
756 &this->prog->Parameters->ParameterValues[index][0];
757
758 assert(this->uniforms < uniform_array_size);
759
760 for (unsigned j = 0; j < 4; j++)
761 stage_prog_data->param[this->uniforms * 4 + j] =
762 &values[GET_SWZ(slots[i].swizzle, j)];
763
764 this->uniform_vector_size[this->uniforms] =
765 (ir->type->is_scalar() || ir->type->is_vector() ||
766 ir->type->is_matrix() ? ir->type->vector_elements : 4);
767
768 this->uniforms++;
769 }
770 }
771
772 dst_reg *
773 vec4_visitor::variable_storage(ir_variable *var)
774 {
775 return (dst_reg *)hash_table_find(this->variable_ht, var);
776 }
777
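/**
 * Evaluate a boolean rvalue and load its value into the flag register,
 * setting *predicate to the predicate (normal, or one of the ALIGN16
 * ALL4H/ANY4H forms for vector comparisons) that the caller should put on
 * the instruction it wants to predicate.
 */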
778 void
779 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
780 enum brw_predicate *predicate)
781 {
782 ir_expression *expr = ir->as_expression();
783
784 *predicate = BRW_PREDICATE_NORMAL;
785
786 if (expr && expr->operation != ir_binop_ubo_load) {
787 src_reg op[3];
788 vec4_instruction *inst;
789
790 assert(expr->get_num_operands() <= 3);
791 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
792 expr->operands[i]->accept(this);
793 op[i] = this->result;
794
795 resolve_ud_negate(&op[i]);
796 }
797
798 switch (expr->operation) {
799 case ir_unop_logic_not:
800 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
801 inst->conditional_mod = BRW_CONDITIONAL_Z;
802 break;
803
804 case ir_binop_logic_xor:
805 if (brw->gen <= 5) {
806 src_reg temp = src_reg(this, ir->type);
807 emit(XOR(dst_reg(temp), op[0], op[1]));
808 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
809 } else {
810 inst = emit(XOR(dst_null_d(), op[0], op[1]));
811 }
812 inst->conditional_mod = BRW_CONDITIONAL_NZ;
813 break;
814
815 case ir_binop_logic_or:
816 if (brw->gen <= 5) {
817 src_reg temp = src_reg(this, ir->type);
818 emit(OR(dst_reg(temp), op[0], op[1]));
819 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
820 } else {
821 inst = emit(OR(dst_null_d(), op[0], op[1]));
822 }
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 break;
825
826 case ir_binop_logic_and:
827 if (brw->gen <= 5) {
828 src_reg temp = src_reg(this, ir->type);
829 emit(AND(dst_reg(temp), op[0], op[1]));
830 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
831 } else {
832 inst = emit(AND(dst_null_d(), op[0], op[1]));
833 }
834 inst->conditional_mod = BRW_CONDITIONAL_NZ;
835 break;
836
837 case ir_unop_f2b:
838 if (brw->gen >= 6) {
839 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
840 } else {
841 inst = emit(MOV(dst_null_f(), op[0]));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 break;
845
846 case ir_unop_i2b:
847 if (brw->gen >= 6) {
848 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
849 } else {
850 inst = emit(MOV(dst_null_d(), op[0]));
851 inst->conditional_mod = BRW_CONDITIONAL_NZ;
852 }
853 break;
854
855 case ir_binop_all_equal:
856 if (brw->gen <= 5) {
857 resolve_bool_comparison(expr->operands[0], &op[0]);
858 resolve_bool_comparison(expr->operands[1], &op[1]);
859 }
860 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
861 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
862 break;
863
864 case ir_binop_any_nequal:
865 if (brw->gen <= 5) {
866 resolve_bool_comparison(expr->operands[0], &op[0]);
867 resolve_bool_comparison(expr->operands[1], &op[1]);
868 }
869 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
870 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
871 break;
872
873 case ir_unop_any:
874 if (brw->gen <= 5) {
875 resolve_bool_comparison(expr->operands[0], &op[0]);
876 }
877 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
878 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
879 break;
880
881 case ir_binop_greater:
882 case ir_binop_gequal:
883 case ir_binop_less:
884 case ir_binop_lequal:
885 case ir_binop_equal:
886 case ir_binop_nequal:
887 if (brw->gen <= 5) {
888 resolve_bool_comparison(expr->operands[0], &op[0]);
889 resolve_bool_comparison(expr->operands[1], &op[1]);
890 }
891 emit(CMP(dst_null_d(), op[0], op[1],
892 brw_conditional_for_comparison(expr->operation)));
893 break;
894
895 case ir_triop_csel: {
896 /* Expand the boolean condition into the flag register. */
897 inst = emit(MOV(dst_null_d(), op[0]));
898 inst->conditional_mod = BRW_CONDITIONAL_NZ;
899
900 /* Select which boolean to return. */
901 dst_reg temp(this, expr->operands[1]->type);
902 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
903 inst->predicate = BRW_PREDICATE_NORMAL;
904
905 /* Expand the result to a condition code. */
906 inst = emit(MOV(dst_null_d(), src_reg(temp)));
907 inst->conditional_mod = BRW_CONDITIONAL_NZ;
908 break;
909 }
910
911 default:
912 unreachable("not reached");
913 }
914 return;
915 }
916
917 ir->accept(this);
918
919 resolve_ud_negate(&this->result);
920
921 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
922 inst->conditional_mod = BRW_CONDITIONAL_NZ;
923 }
924
925 /**
926 * Emit a gen6 IF statement with the comparison folded into the IF
927 * instruction.
928 */
929 void
930 vec4_visitor::emit_if_gen6(ir_if *ir)
931 {
932 ir_expression *expr = ir->condition->as_expression();
933
934 if (expr && expr->operation != ir_binop_ubo_load) {
935 src_reg op[3];
936 dst_reg temp;
937
938 assert(expr->get_num_operands() <= 3);
939 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
940 expr->operands[i]->accept(this);
941 op[i] = this->result;
942 }
943
944 switch (expr->operation) {
945 case ir_unop_logic_not:
946 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
947 return;
948
949 case ir_binop_logic_xor:
950 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
951 return;
952
953 case ir_binop_logic_or:
954 temp = dst_reg(this, glsl_type::bool_type);
955 emit(OR(temp, op[0], op[1]));
956 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
957 return;
958
959 case ir_binop_logic_and:
960 temp = dst_reg(this, glsl_type::bool_type);
961 emit(AND(temp, op[0], op[1]));
962 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
963 return;
964
965 case ir_unop_f2b:
966 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_i2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_binop_greater:
974 case ir_binop_gequal:
975 case ir_binop_less:
976 case ir_binop_lequal:
977 case ir_binop_equal:
978 case ir_binop_nequal:
979 emit(IF(op[0], op[1],
980 brw_conditional_for_comparison(expr->operation)));
981 return;
982
983 case ir_binop_all_equal:
984 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
985 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
986 return;
987
988 case ir_binop_any_nequal:
989 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
990 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
991 return;
992
993 case ir_unop_any:
994 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
995 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
996 return;
997
998 case ir_triop_csel: {
999 /* Expand the boolean condition into the flag register. */
1000 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1001 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1002
1003 /* Select which boolean to return. */
1004 dst_reg temp(this, expr->operands[1]->type);
1005 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1006 inst->predicate = BRW_PREDICATE_NORMAL;
1007
1008 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1009 return;
1010 }
1011
1012 default:
1013 unreachable("not reached");
1014 }
1015 return;
1016 }
1017
1018 ir->condition->accept(this);
1019
1020 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1021 }
1022
1023 void
1024 vec4_visitor::visit(ir_variable *ir)
1025 {
1026 dst_reg *reg = NULL;
1027
1028 if (variable_storage(ir))
1029 return;
1030
1031 switch (ir->data.mode) {
1032 case ir_var_shader_in:
1033 assert(ir->data.location != -1);
1034 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1035 break;
1036
1037 case ir_var_shader_out:
1038 assert(ir->data.location != -1);
1039 reg = new(mem_ctx) dst_reg(this, ir->type);
1040
1041 for (int i = 0; i < type_size(ir->type); i++) {
1042 output_reg[ir->data.location + i] = *reg;
1043 output_reg[ir->data.location + i].reg_offset = i;
1044 output_reg[ir->data.location + i].type =
1045 brw_type_for_base_type(ir->type->get_scalar_type());
1046 output_reg_annotation[ir->data.location + i] = ir->name;
1047 }
1048 break;
1049
1050 case ir_var_auto:
1051 case ir_var_temporary:
1052 reg = new(mem_ctx) dst_reg(this, ir->type);
1053 break;
1054
1055 case ir_var_uniform:
1056 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1057
1058 /* Thanks to the lower_ubo_reference pass, we will see only
1059 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1060 * variables, so no need for them to be in variable_ht.
1061 *
1062 * Some uniforms, such as samplers and atomic counters, have no actual
1063 * storage, so we should ignore them.
1064 */
1065 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1066 return;
1067
1068 /* Track how big the whole uniform variable is, in case we need to put a
1069 * copy of its data into pull constants for array access.
1070 */
1071 assert(this->uniforms < uniform_array_size);
1072 this->uniform_size[this->uniforms] = type_size(ir->type);
1073
1074 if (!strncmp(ir->name, "gl_", 3)) {
1075 setup_builtin_uniform_values(ir);
1076 } else {
1077 setup_uniform_values(ir);
1078 }
1079 break;
1080
1081 case ir_var_system_value:
1082 reg = make_reg_for_system_value(ir);
1083 break;
1084
1085 default:
1086 unreachable("not reached");
1087 }
1088
1089 reg->type = brw_type_for_base_type(ir->type);
1090 hash_table_insert(this->variable_ht, reg, ir);
1091 }
1092
1093 void
1094 vec4_visitor::visit(ir_loop *ir)
1095 {
1096 /* We don't want debugging output to print the whole body of the
1097 * loop as the annotation.
1098 */
1099 this->base_ir = NULL;
1100
1101 emit(BRW_OPCODE_DO);
1102
1103 visit_instructions(&ir->body_instructions);
1104
1105 emit(BRW_OPCODE_WHILE);
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_loop_jump *ir)
1110 {
1111 switch (ir->mode) {
1112 case ir_loop_jump::jump_break:
1113 emit(BRW_OPCODE_BREAK);
1114 break;
1115 case ir_loop_jump::jump_continue:
1116 emit(BRW_OPCODE_CONTINUE);
1117 break;
1118 }
1119 }
1120
1121
1122 void
1123 vec4_visitor::visit(ir_function_signature *)
1124 {
1125 unreachable("not reached");
1126 }
1127
1128 void
1129 vec4_visitor::visit(ir_function *ir)
1130 {
1131 /* Ignore function bodies other than main() -- we shouldn't see calls to
1132 * them since they should all be inlined.
1133 */
1134 if (strcmp(ir->name, "main") == 0) {
1135 const ir_function_signature *sig;
1136 exec_list empty;
1137
1138 sig = ir->matching_signature(NULL, &empty, false);
1139
1140 assert(sig);
1141
1142 visit_instructions(&sig->body);
1143 }
1144 }
1145
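/**
 * Try to fuse an add of a multiply into a single MAD.  Only possible on
 * gen6+ (three-source instructions) and for floating-point data.  Returns
 * false if neither operand is a multiply, in which case the caller falls
 * back to a plain ADD.
 */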
1146 bool
1147 vec4_visitor::try_emit_mad(ir_expression *ir)
1148 {
1149 /* 3-src instructions were introduced in gen6. */
1150 if (brw->gen < 6)
1151 return false;
1152
1153 /* MAD can only handle floating-point data. */
1154 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1155 return false;
1156
1157 ir_rvalue *nonmul;
1158 ir_expression *mul;
1159 bool mul_negate, mul_abs;
1160
1161 for (int i = 0; i < 2; i++) {
1162 mul_negate = false;
1163 mul_abs = false;
1164
1165 mul = ir->operands[i]->as_expression();
1166 nonmul = ir->operands[1 - i];
1167
1168 if (mul && mul->operation == ir_unop_abs) {
1169 mul = mul->operands[0]->as_expression();
1170 mul_abs = true;
1171 } else if (mul && mul->operation == ir_unop_neg) {
1172 mul = mul->operands[0]->as_expression();
1173 mul_negate = true;
1174 }
1175
1176 if (mul && mul->operation == ir_binop_mul)
1177 break;
1178 }
1179
1180 if (!mul || mul->operation != ir_binop_mul)
1181 return false;
1182
1183 nonmul->accept(this);
1184 src_reg src0 = fix_3src_operand(this->result);
1185
1186 mul->operands[0]->accept(this);
1187 src_reg src1 = fix_3src_operand(this->result);
1188 src1.negate ^= mul_negate;
1189 src1.abs = mul_abs;
1190 if (mul_abs)
1191 src1.negate = false;
1192
1193 mul->operands[1]->accept(this);
1194 src_reg src2 = fix_3src_operand(this->result);
1195 src2.abs = mul_abs;
1196 if (mul_abs)
1197 src2.negate = false;
1198
1199 this->result = src_reg(this, ir->type);
1200 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1201
1202 return true;
1203 }
1204
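/**
 * Try to emit ir_unop_b2f of a comparison as the comparison itself: a CMP
 * that writes the result register directly, followed by a predicated SEL
 * that produces 1.0f where the comparison held and leaves 0.0f where it did
 * not.  Returns false if the operand isn't a supported comparison or the
 * hardware can't guarantee a zero result on failure.
 */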
1205 bool
1206 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1207 {
1208 /* This optimization relies on CMP setting the destination to 0 when
1209 * false. Early hardware only sets the least significant bit, and
1210    * leaves the other bits undefined, so this optimization can't be used there.
1211 */
1212 if (brw->gen < 6)
1213 return false;
1214
1215 ir_expression *const cmp = ir->operands[0]->as_expression();
1216
1217 if (cmp == NULL)
1218 return false;
1219
1220 switch (cmp->operation) {
1221 case ir_binop_less:
1222 case ir_binop_greater:
1223 case ir_binop_lequal:
1224 case ir_binop_gequal:
1225 case ir_binop_equal:
1226 case ir_binop_nequal:
1227 break;
1228
1229 default:
1230 return false;
1231 }
1232
1233 cmp->operands[0]->accept(this);
1234 const src_reg cmp_src0 = this->result;
1235
1236 cmp->operands[1]->accept(this);
1237 const src_reg cmp_src1 = this->result;
1238
1239 this->result = src_reg(this, ir->type);
1240
1241 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1242 brw_conditional_for_comparison(cmp->operation)));
1243
1244 /* If the comparison is false, this->result will just happen to be zero.
1245 */
1246 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1247 this->result, src_reg(1.0f));
1248 inst->predicate = BRW_PREDICATE_NORMAL;
1249 inst->predicate_inverse = true;
1250
1251 return true;
1252 }
1253
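/* Emit a MIN or MAX (selected by conditionalmod) as a conditional-mod SEL on
 * gen6+, or as a CMP followed by a predicated SEL on older hardware, which
 * has no conditional SEL.
 */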
1254 void
1255 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1256 src_reg src0, src_reg src1)
1257 {
1258 vec4_instruction *inst;
1259
1260 if (brw->gen >= 6) {
1261 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1262 inst->conditional_mod = conditionalmod;
1263 } else {
1264 emit(CMP(dst, src0, src1, conditionalmod));
1265
1266 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1267 inst->predicate = BRW_PREDICATE_NORMAL;
1268 }
1269 }
1270
1271 void
1272 vec4_visitor::emit_lrp(const dst_reg &dst,
1273 const src_reg &x, const src_reg &y, const src_reg &a)
1274 {
1275 if (brw->gen >= 6) {
1276 /* Note that the instruction's argument order is reversed from GLSL
1277 * and the IR.
1278 */
1279 emit(LRP(dst,
1280 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1281 } else {
1282 /* Earlier generations don't support three source operations, so we
1283 * need to emit x*(1-a) + y*a.
1284 */
1285 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1286 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1287 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1288 y_times_a.writemask = dst.writemask;
1289 one_minus_a.writemask = dst.writemask;
1290 x_times_one_minus_a.writemask = dst.writemask;
1291
1292 emit(MUL(y_times_a, y, a));
1293 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1294 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1295 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1296 }
1297 }
1298
1299 /**
1300 * Emits the instructions needed to perform a pull constant load. before_block
1301  * and before_inst can be NULL, in which case the instructions will be
1302  * appended to the end of the instruction list.
1303 */
1304 void
1305 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1306 src_reg surf_index,
1307 src_reg offset_reg,
1308 bblock_t *before_block,
1309 vec4_instruction *before_inst)
1310 {
1311 assert((before_inst == NULL && before_block == NULL) ||
1312 (before_inst && before_block));
1313
1314 vec4_instruction *pull;
1315
1316 if (brw->gen >= 7) {
1317 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1318
1319 /* We have to use a message header on Skylake to get SIMD4x2 mode.
1320 * Reserve space for the register.
1321 */
1322 if (brw->gen >= 9) {
1323 grf_offset.reg_offset++;
1324 alloc.sizes[grf_offset.reg] = 2;
1325 }
1326
1327 grf_offset.type = offset_reg.type;
1328
1329 pull = MOV(grf_offset, offset_reg);
1330
1331 if (before_inst)
1332 emit_before(before_block, before_inst, pull);
1333 else
1334 emit(pull);
1335
1336 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1337 dst,
1338 surf_index,
1339 src_reg(grf_offset));
1340 pull->mlen = 1;
1341 } else {
1342 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1343 dst,
1344 surf_index,
1345 offset_reg);
1346 pull->base_mrf = 14;
1347 pull->mlen = 1;
1348 }
1349
1350 if (before_inst)
1351 emit_before(before_block, before_inst, pull);
1352 else
1353 emit(pull);
1354 }
1355
1356 void
1357 vec4_visitor::visit(ir_expression *ir)
1358 {
1359 unsigned int operand;
1360 src_reg op[ARRAY_SIZE(ir->operands)];
1361 vec4_instruction *inst;
1362
1363 if (ir->operation == ir_binop_add) {
1364 if (try_emit_mad(ir))
1365 return;
1366 }
1367
1368 if (ir->operation == ir_unop_b2f) {
1369 if (try_emit_b2f_of_compare(ir))
1370 return;
1371 }
1372
1373 /* Storage for our result. Ideally for an assignment we'd be using
1374 * the actual storage for the result here, instead.
1375 */
1376 dst_reg result_dst(this, ir->type);
1377 src_reg result_src(result_dst);
1378
1379 if (ir->operation == ir_triop_csel) {
1380 ir->operands[1]->accept(this);
1381 op[1] = this->result;
1382 ir->operands[2]->accept(this);
1383 op[2] = this->result;
1384
1385 enum brw_predicate predicate;
1386 emit_bool_to_cond_code(ir->operands[0], &predicate);
1387 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1388 inst->predicate = predicate;
1389 this->result = result_src;
1390 return;
1391 }
1392
1393 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1394 this->result.file = BAD_FILE;
1395 ir->operands[operand]->accept(this);
1396 if (this->result.file == BAD_FILE) {
1397 fprintf(stderr, "Failed to get tree for expression operand:\n");
1398 ir->operands[operand]->fprint(stderr);
1399 exit(1);
1400 }
1401 op[operand] = this->result;
1402
1403 /* Matrix expression operands should have been broken down to vector
1404 * operations already.
1405 */
1406 assert(!ir->operands[operand]->type->is_matrix());
1407 }
1408
1409 /* If nothing special happens, this is the result. */
1410 this->result = result_src;
1411
1412 switch (ir->operation) {
1413 case ir_unop_logic_not:
1414 emit(NOT(result_dst, op[0]));
1415 break;
1416 case ir_unop_neg:
1417 op[0].negate = !op[0].negate;
1418 emit(MOV(result_dst, op[0]));
1419 break;
1420 case ir_unop_abs:
1421 op[0].abs = true;
1422 op[0].negate = false;
1423 emit(MOV(result_dst, op[0]));
1424 break;
1425
1426 case ir_unop_sign:
1427 if (ir->type->is_float()) {
1428 /* AND(val, 0x80000000) gives the sign bit.
1429 *
1430 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1431 * zero.
1432 */
1433 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1434
1435 op[0].type = BRW_REGISTER_TYPE_UD;
1436 result_dst.type = BRW_REGISTER_TYPE_UD;
1437 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1438
1439 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1440 inst->predicate = BRW_PREDICATE_NORMAL;
1441
1442 this->result.type = BRW_REGISTER_TYPE_F;
1443 } else {
1444 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1445 * -> non-negative val generates 0x00000000.
1446 * Predicated OR sets 1 if val is positive.
1447 */
1448 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1449
1450 emit(ASR(result_dst, op[0], src_reg(31)));
1451
1452 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1453 inst->predicate = BRW_PREDICATE_NORMAL;
1454 }
1455 break;
1456
1457 case ir_unop_rcp:
1458 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1459 break;
1460
1461 case ir_unop_exp2:
1462 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1463 break;
1464 case ir_unop_log2:
1465 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1466 break;
1467 case ir_unop_exp:
1468 case ir_unop_log:
1469 unreachable("not reached: should be handled by ir_explog_to_explog2");
1470 case ir_unop_sin:
1471 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1472 break;
1473 case ir_unop_cos:
1474 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1475 break;
1476
1477 case ir_unop_dFdx:
1478 case ir_unop_dFdx_coarse:
1479 case ir_unop_dFdx_fine:
1480 case ir_unop_dFdy:
1481 case ir_unop_dFdy_coarse:
1482 case ir_unop_dFdy_fine:
1483 unreachable("derivatives not valid in vertex shader");
1484
1485 case ir_unop_bitfield_reverse:
1486 emit(BFREV(result_dst, op[0]));
1487 break;
1488 case ir_unop_bit_count:
1489 emit(CBIT(result_dst, op[0]));
1490 break;
1491 case ir_unop_find_msb: {
1492 src_reg temp = src_reg(this, glsl_type::uint_type);
1493
1494 inst = emit(FBH(dst_reg(temp), op[0]));
1495 inst->dst.writemask = WRITEMASK_XYZW;
1496
1497 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1498 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1499 * subtract the result from 31 to convert the MSB count into an LSB count.
1500 */
1501
1502 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1503 temp.swizzle = BRW_SWIZZLE_NOOP;
1504 emit(MOV(result_dst, temp));
1505
1506 src_reg src_tmp = src_reg(result_dst);
1507 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1508
1509 src_tmp.negate = true;
1510 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1511 inst->predicate = BRW_PREDICATE_NORMAL;
1512 break;
1513 }
1514 case ir_unop_find_lsb:
1515 emit(FBL(result_dst, op[0]));
1516 break;
1517 case ir_unop_saturate:
1518 inst = emit(MOV(result_dst, op[0]));
1519 inst->saturate = true;
1520 break;
1521
1522 case ir_unop_noise:
1523 unreachable("not reached: should be handled by lower_noise");
1524
1525 case ir_binop_add:
1526 emit(ADD(result_dst, op[0], op[1]));
1527 break;
1528 case ir_binop_sub:
1529 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1530
1531 case ir_binop_mul:
1532 if (brw->gen < 8 && ir->type->is_integer()) {
1533 /* For integer multiplication, the MUL uses the low 16 bits of one of
1534 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1535          * accumulates the contribution of the upper 16 bits of that
1536 * operand. If we can determine that one of the args is in the low
1537 * 16 bits, though, we can just emit a single MUL.
1538 */
1539 if (ir->operands[0]->is_uint16_constant()) {
1540 if (brw->gen < 7)
1541 emit(MUL(result_dst, op[0], op[1]));
1542 else
1543 emit(MUL(result_dst, op[1], op[0]));
1544 } else if (ir->operands[1]->is_uint16_constant()) {
1545 if (brw->gen < 7)
1546 emit(MUL(result_dst, op[1], op[0]));
1547 else
1548 emit(MUL(result_dst, op[0], op[1]));
1549 } else {
1550 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1551
1552 emit(MUL(acc, op[0], op[1]));
1553 emit(MACH(dst_null_d(), op[0], op[1]));
1554 emit(MOV(result_dst, src_reg(acc)));
1555 }
1556 } else {
1557 emit(MUL(result_dst, op[0], op[1]));
1558 }
1559 break;
1560 case ir_binop_imul_high: {
1561 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1562
1563 emit(MUL(acc, op[0], op[1]));
1564 emit(MACH(result_dst, op[0], op[1]));
1565 break;
1566 }
1567 case ir_binop_div:
1568 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1569 assert(ir->type->is_integer());
1570 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1571 break;
1572 case ir_binop_carry: {
1573 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1574
1575 emit(ADDC(dst_null_ud(), op[0], op[1]));
1576 emit(MOV(result_dst, src_reg(acc)));
1577 break;
1578 }
1579 case ir_binop_borrow: {
1580 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1581
1582 emit(SUBB(dst_null_ud(), op[0], op[1]));
1583 emit(MOV(result_dst, src_reg(acc)));
1584 break;
1585 }
1586 case ir_binop_mod:
1587 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1588 assert(ir->type->is_integer());
1589 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1590 break;
1591
1592 case ir_binop_less:
1593 case ir_binop_greater:
1594 case ir_binop_lequal:
1595 case ir_binop_gequal:
1596 case ir_binop_equal:
1597 case ir_binop_nequal: {
1598 if (brw->gen <= 5) {
1599 resolve_bool_comparison(ir->operands[0], &op[0]);
1600 resolve_bool_comparison(ir->operands[1], &op[1]);
1601 }
1602 emit(CMP(result_dst, op[0], op[1],
1603 brw_conditional_for_comparison(ir->operation)));
1604 break;
1605 }
1606
1607 case ir_binop_all_equal:
1608 if (brw->gen <= 5) {
1609 resolve_bool_comparison(ir->operands[0], &op[0]);
1610 resolve_bool_comparison(ir->operands[1], &op[1]);
1611 }
1612
1613 /* "==" operator producing a scalar boolean. */
1614 if (ir->operands[0]->type->is_vector() ||
1615 ir->operands[1]->type->is_vector()) {
1616 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1617 emit(MOV(result_dst, src_reg(0)));
1618 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1619 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1620 } else {
1621 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1622 }
1623 break;
1624 case ir_binop_any_nequal:
1625 if (brw->gen <= 5) {
1626 resolve_bool_comparison(ir->operands[0], &op[0]);
1627 resolve_bool_comparison(ir->operands[1], &op[1]);
1628 }
1629
1630 /* "!=" operator producing a scalar boolean. */
1631 if (ir->operands[0]->type->is_vector() ||
1632 ir->operands[1]->type->is_vector()) {
1633 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1634
1635 emit(MOV(result_dst, src_reg(0)));
1636 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1637 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1638 } else {
1639 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1640 }
1641 break;
1642
1643 case ir_unop_any:
1644 if (brw->gen <= 5) {
1645 resolve_bool_comparison(ir->operands[0], &op[0]);
1646 }
1647 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1648 emit(MOV(result_dst, src_reg(0)));
1649
1650 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1651 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1652 break;
1653
1654 case ir_binop_logic_xor:
1655 emit(XOR(result_dst, op[0], op[1]));
1656 break;
1657
1658 case ir_binop_logic_or:
1659 emit(OR(result_dst, op[0], op[1]));
1660 break;
1661
1662 case ir_binop_logic_and:
1663 emit(AND(result_dst, op[0], op[1]));
1664 break;
1665
1666 case ir_binop_dot:
1667 assert(ir->operands[0]->type->is_vector());
1668 assert(ir->operands[0]->type == ir->operands[1]->type);
1669 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1670 break;
1671
1672 case ir_unop_sqrt:
1673 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1674 break;
1675 case ir_unop_rsq:
1676 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1677 break;
1678
1679 case ir_unop_bitcast_i2f:
1680 case ir_unop_bitcast_u2f:
1681 this->result = op[0];
1682 this->result.type = BRW_REGISTER_TYPE_F;
1683 break;
1684
1685 case ir_unop_bitcast_f2i:
1686 this->result = op[0];
1687 this->result.type = BRW_REGISTER_TYPE_D;
1688 break;
1689
1690 case ir_unop_bitcast_f2u:
1691 this->result = op[0];
1692 this->result.type = BRW_REGISTER_TYPE_UD;
1693 break;
1694
1695 case ir_unop_i2f:
1696 case ir_unop_i2u:
1697 case ir_unop_u2i:
1698 case ir_unop_u2f:
1699 case ir_unop_f2i:
1700 case ir_unop_f2u:
1701 emit(MOV(result_dst, op[0]));
1702 break;
1703 case ir_unop_b2i:
1704 emit(AND(result_dst, op[0], src_reg(1)));
1705 break;
1706 case ir_unop_b2f:
1707 if (brw->gen <= 5) {
1708 resolve_bool_comparison(ir->operands[0], &op[0]);
1709 }
1710 op[0].type = BRW_REGISTER_TYPE_D;
1711 result_dst.type = BRW_REGISTER_TYPE_D;
1712 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1713 result_dst.type = BRW_REGISTER_TYPE_F;
1714 break;
1715 case ir_unop_f2b:
1716 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1717 break;
1718 case ir_unop_i2b:
1719 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1720 break;
1721
1722 case ir_unop_trunc:
1723 emit(RNDZ(result_dst, op[0]));
1724 break;
1725 case ir_unop_ceil: {
1726 src_reg tmp = src_reg(this, ir->type);
1727 op[0].negate = !op[0].negate;
1728 emit(RNDD(dst_reg(tmp), op[0]));
1729 tmp.negate = true;
1730 emit(MOV(result_dst, tmp));
1731 }
1732 break;
1733 case ir_unop_floor:
1734 inst = emit(RNDD(result_dst, op[0]));
1735 break;
1736 case ir_unop_fract:
1737 inst = emit(FRC(result_dst, op[0]));
1738 break;
1739 case ir_unop_round_even:
1740 emit(RNDE(result_dst, op[0]));
1741 break;
1742
1743 case ir_binop_min:
1744 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1745 break;
1746 case ir_binop_max:
1747 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1748 break;
1749
1750 case ir_binop_pow:
1751 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1752 break;
1753
1754 case ir_unop_bit_not:
1755 inst = emit(NOT(result_dst, op[0]));
1756 break;
1757 case ir_binop_bit_and:
1758 inst = emit(AND(result_dst, op[0], op[1]));
1759 break;
1760 case ir_binop_bit_xor:
1761 inst = emit(XOR(result_dst, op[0], op[1]));
1762 break;
1763 case ir_binop_bit_or:
1764 inst = emit(OR(result_dst, op[0], op[1]));
1765 break;
1766
1767 case ir_binop_lshift:
1768 inst = emit(SHL(result_dst, op[0], op[1]));
1769 break;
1770
1771 case ir_binop_rshift:
1772 if (ir->type->base_type == GLSL_TYPE_INT)
1773 inst = emit(ASR(result_dst, op[0], op[1]));
1774 else
1775 inst = emit(SHR(result_dst, op[0], op[1]));
1776 break;
1777
1778 case ir_binop_bfm:
1779 emit(BFI1(result_dst, op[0], op[1]));
1780 break;
1781
1782 case ir_binop_ubo_load: {
1783 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1784 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1785 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1786 src_reg offset;
1787
1788 /* Now, load the vector from that offset. */
1789 assert(ir->type->is_vector() || ir->type->is_scalar());
1790
1791 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1792 packed_consts.type = result.type;
1793 src_reg surf_index;
1794
1795 if (const_uniform_block) {
1796 /* The block index is a constant, so just emit the binding table entry
1797 * as an immediate.
1798 */
1799 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1800 const_uniform_block->value.u[0]);
1801 } else {
1802 /* The block index is not a constant. Evaluate the index expression
1803 * per-channel and add the base UBO index; the generator will select
1804 * a value from any live channel.
1805 */
1806 surf_index = src_reg(this, glsl_type::uint_type);
1807 emit(ADD(dst_reg(surf_index), op[0],
1808 src_reg(prog_data->base.binding_table.ubo_start)));
1809
1810 /* Assume this may touch any UBO. It would be nice to provide
1811 * a tighter bound, but the array information is already lowered away.
1812 */
1813 brw_mark_surface_used(&prog_data->base,
1814 prog_data->base.binding_table.ubo_start +
1815 shader_prog->NumUniformBlocks - 1);
1816 }
1817
1818 if (const_offset_ir) {
1819 if (brw->gen >= 8) {
1820 /* Store the offset in a GRF so we can send-from-GRF. */
1821 offset = src_reg(this, glsl_type::int_type);
1822 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1823 } else {
1824 /* Immediates are fine on older generations since they'll be moved
1825 * to a (potentially fake) MRF at the generator level.
1826 */
1827 offset = src_reg(const_offset / 16);
1828 }
1829 } else {
1830 offset = src_reg(this, glsl_type::uint_type);
1831 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1832 }
1833
1834 emit_pull_constant_load_reg(dst_reg(packed_consts),
1835 surf_index,
1836 offset,
1837 NULL, NULL /* before_block/inst */);
1838
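      /* The pull constant load always fetches an aligned 16-byte vec4;
       * adjust the swizzle so the value is read starting at the requested
       * component within that vec4.
       */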
1839 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1840 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1841 const_offset % 16 / 4,
1842 const_offset % 16 / 4,
1843 const_offset % 16 / 4);
1844
1845 /* UBO bools are any nonzero int. We need to convert them to use the
1846 * value of true stored in ctx->Const.UniformBooleanTrue.
1847 */
1848 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1849 emit(CMP(result_dst, packed_consts, src_reg(0u),
1850 BRW_CONDITIONAL_NZ));
1851 } else {
1852 emit(MOV(result_dst, packed_consts));
1853 }
1854 break;
1855 }
1856
1857 case ir_binop_vector_extract:
1858 unreachable("should have been lowered by vec_index_to_cond_assign");
1859
1860 case ir_triop_fma:
1861 op[0] = fix_3src_operand(op[0]);
1862 op[1] = fix_3src_operand(op[1]);
1863 op[2] = fix_3src_operand(op[2]);
1864 /* Note that the instruction's argument order is reversed from GLSL
1865 * and the IR.
1866 */
1867 emit(MAD(result_dst, op[2], op[1], op[0]));
1868 break;
1869
1870 case ir_triop_lrp:
1871 emit_lrp(result_dst, op[0], op[1], op[2]);
1872 break;
1873
1874 case ir_triop_csel:
1875 unreachable("already handled above");
1876 break;
1877
1878 case ir_triop_bfi:
1879 op[0] = fix_3src_operand(op[0]);
1880 op[1] = fix_3src_operand(op[1]);
1881 op[2] = fix_3src_operand(op[2]);
1882 emit(BFI2(result_dst, op[0], op[1], op[2]));
1883 break;
1884
1885 case ir_triop_bitfield_extract:
1886 op[0] = fix_3src_operand(op[0]);
1887 op[1] = fix_3src_operand(op[1]);
1888 op[2] = fix_3src_operand(op[2]);
1889 /* Note that the instruction's argument order is reversed from GLSL
1890 * and the IR.
1891 */
1892 emit(BFE(result_dst, op[2], op[1], op[0]));
1893 break;
1894
1895 case ir_triop_vector_insert:
1896 unreachable("should have been lowered by lower_vector_insert");
1897
1898 case ir_quadop_bitfield_insert:
1899 unreachable("not reached: should be handled by "
1900 "bitfield_insert_to_bfm_bfi\n");
1901
1902 case ir_quadop_vector:
1903 unreachable("not reached: should be handled by lower_quadop_vector");
1904
1905 case ir_unop_pack_half_2x16:
1906 emit_pack_half_2x16(result_dst, op[0]);
1907 break;
1908 case ir_unop_unpack_half_2x16:
1909 emit_unpack_half_2x16(result_dst, op[0]);
1910 break;
1911 case ir_unop_unpack_unorm_4x8:
1912 emit_unpack_unorm_4x8(result_dst, op[0]);
1913 break;
1914 case ir_unop_unpack_snorm_4x8:
1915 emit_unpack_snorm_4x8(result_dst, op[0]);
1916 break;
1917 case ir_unop_pack_unorm_4x8:
1918 emit_pack_unorm_4x8(result_dst, op[0]);
1919 break;
1920 case ir_unop_pack_snorm_4x8:
1921 emit_pack_snorm_4x8(result_dst, op[0]);
1922 break;
1923 case ir_unop_pack_snorm_2x16:
1924 case ir_unop_pack_unorm_2x16:
1925 case ir_unop_unpack_snorm_2x16:
1926 case ir_unop_unpack_unorm_2x16:
1927 unreachable("not reached: should be handled by lower_packing_builtins");
1928 case ir_unop_unpack_half_2x16_split_x:
1929 case ir_unop_unpack_half_2x16_split_y:
1930 case ir_binop_pack_half_2x16_split:
1931 case ir_unop_interpolate_at_centroid:
1932 case ir_binop_interpolate_at_sample:
1933 case ir_binop_interpolate_at_offset:
1934 unreachable("not reached: should not occur in vertex shader");
1935 case ir_binop_ldexp:
1936 unreachable("not reached: should be handled by ldexp_to_arith()");
1937 case ir_unop_d2f:
1938 case ir_unop_f2d:
1939 case ir_unop_d2i:
1940 case ir_unop_i2d:
1941 case ir_unop_d2u:
1942 case ir_unop_u2d:
1943 case ir_unop_d2b:
1944 case ir_unop_pack_double_2x32:
1945 case ir_unop_unpack_double_2x32:
1946 case ir_unop_frexp_sig:
1947 case ir_unop_frexp_exp:
1948 unreachable("fp64 todo");
1949 }
1950 }
1951
1952
1953 void
1954 vec4_visitor::visit(ir_swizzle *ir)
1955 {
1956 /* Note that this is only swizzles in expressions, not those on the left
1957 * hand side of an assignment, which do write masking. See ir_assignment
1958 * for that.
1959 */
1960 const unsigned swz = brw_compose_swizzle(
1961 brw_swizzle_for_size(ir->type->vector_elements),
1962 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1963
1964 ir->val->accept(this);
1965 this->result = swizzle(this->result, swz);
1966 }
1967
1968 void
1969 vec4_visitor::visit(ir_dereference_variable *ir)
1970 {
1971 const struct glsl_type *type = ir->type;
1972 dst_reg *reg = variable_storage(ir->var);
1973
1974 if (!reg) {
1975 fail("Failed to find variable storage for %s\n", ir->var->name);
1976 this->result = src_reg(brw_null_reg());
1977 return;
1978 }
1979
1980 this->result = src_reg(*reg);
1981
1982 /* System values get their swizzle from the dst_reg writemask */
1983 if (ir->var->data.mode == ir_var_system_value)
1984 return;
1985
1986 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1987 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
1988 }
1989
1990
1991 int
1992 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1993 {
1994 /* Under normal circumstances array elements are stored consecutively, so
1995 * the stride is equal to the size of the array element.
1996 */
1997 return type_size(ir->type);
1998 }
1999
2000
2001 void
2002 vec4_visitor::visit(ir_dereference_array *ir)
2003 {
2004 ir_constant *constant_index;
2005 src_reg src;
2006 int array_stride = compute_array_stride(ir);
2007
2008 constant_index = ir->array_index->constant_expression_value();
2009
2010 ir->array->accept(this);
2011 src = this->result;
2012
2013 if (constant_index) {
2014 src.reg_offset += constant_index->value.i[0] * array_stride;
2015 } else {
2016 /* Variable index array dereference. It consumes the "vec4" of the
2017 * base of the array plus an index that offsets the Mesa register
2018 * index.
2019 */
2020 ir->array_index->accept(this);
2021
2022 src_reg index_reg;
2023
2024 if (array_stride == 1) {
2025 index_reg = this->result;
2026 } else {
2027 index_reg = src_reg(this, glsl_type::int_type);
2028
2029 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2030 }
2031
2032 if (src.reladdr) {
2033 src_reg temp = src_reg(this, glsl_type::int_type);
2034
2035 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2036
2037 index_reg = temp;
2038 }
2039
2040 src.reladdr = ralloc(mem_ctx, src_reg);
2041 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2042 }
2043
2044 /* If the type is smaller than a vec4, replicate the last channel out. */
2045 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2046 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2047 else
2048 src.swizzle = BRW_SWIZZLE_NOOP;
2049 src.type = brw_type_for_base_type(ir->type);
2050
2051 this->result = src;
2052 }
2053
2054 void
2055 vec4_visitor::visit(ir_dereference_record *ir)
2056 {
2057 unsigned int i;
2058 const glsl_type *struct_type = ir->record->type;
2059 int offset = 0;
2060
2061 ir->record->accept(this);
2062
2063 for (i = 0; i < struct_type->length; i++) {
2064 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2065 break;
2066 offset += type_size(struct_type->fields.structure[i].type);
2067 }
2068
2069 /* If the type is smaller than a vec4, replicate the last channel out. */
2070 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2071 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2072 else
2073 this->result.swizzle = BRW_SWIZZLE_NOOP;
2074 this->result.type = brw_type_for_base_type(ir->type);
2075
2076 this->result.reg_offset += offset;
2077 }
2078
2079 /**
2080 * We want to be careful in assignment setup to hit the actual storage
2081 * instead of potentially using a temporary like we might with the
2082 * ir_dereference handler.
2083 */
2084 static dst_reg
2085 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2086 {
2087 /* The LHS must be a dereference. If the LHS is a variable indexed array
2088 * access of a vector, it must be separated into a series of conditional moves
2089 * before reaching this point (see ir_vec_index_to_cond_assign).
2090 */
2091 assert(ir->as_dereference());
2092 ir_dereference_array *deref_array = ir->as_dereference_array();
2093 if (deref_array) {
2094 assert(!deref_array->array->type->is_vector());
2095 }
2096
2097 /* Use the rvalue deref handler for the most part. We'll ignore
2098 * swizzles in it and write swizzles using writemask, though.
2099 */
2100 ir->accept(v);
2101 return dst_reg(v->result);
2102 }
2103
2104 void
2105 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2106 const struct glsl_type *type,
2107 enum brw_predicate predicate)
2108 {
2109 if (type->base_type == GLSL_TYPE_STRUCT) {
2110 for (unsigned int i = 0; i < type->length; i++) {
2111 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2112 }
2113 return;
2114 }
2115
2116 if (type->is_array()) {
2117 for (unsigned int i = 0; i < type->length; i++) {
2118 emit_block_move(dst, src, type->fields.array, predicate);
2119 }
2120 return;
2121 }
2122
2123 if (type->is_matrix()) {
2124 const struct glsl_type *vec_type;
2125
2126 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2127 type->vector_elements, 1);
2128
2129 for (int i = 0; i < type->matrix_columns; i++) {
2130 emit_block_move(dst, src, vec_type, predicate);
2131 }
2132 return;
2133 }
2134
2135 assert(type->is_scalar() || type->is_vector());
2136
2137 dst->type = brw_type_for_base_type(type);
2138 src->type = dst->type;
2139
2140 dst->writemask = (1 << type->vector_elements) - 1;
2141
2142 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2143
2144 vec4_instruction *inst = emit(MOV(*dst, *src));
2145 inst->predicate = predicate;
2146
2147 dst->reg_offset++;
2148 src->reg_offset++;
2149 }
2150
2151
2152 /* If the RHS processing resulted in an instruction generating a
2153 * temporary value, and it would be easy to rewrite the instruction to
2154 * generate its result right into the LHS instead, do so. This ends
2155 * up reliably removing instructions where it can be tricky to do so
2156 * later without real UD chain information.
2157 */
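/* A sketch of the intent (hypothetical IR, for illustration): for an
 * assignment like "v.xy = a + b", the RHS visit emits "ADD tmp, a, b" and
 * the assignment would then add "MOV v.xy, tmp"; rewriting the ADD's
 * destination to v.xy lets us drop the MOV entirely.
 */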
2158 bool
2159 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2160 dst_reg dst,
2161 src_reg src,
2162 vec4_instruction *pre_rhs_inst,
2163 vec4_instruction *last_rhs_inst)
2164 {
2165 /* This could be supported, but it would take more smarts. */
2166 if (ir->condition)
2167 return false;
2168
2169 if (pre_rhs_inst == last_rhs_inst)
2170 return false; /* No instructions generated to work with. */
2171
2172 /* Make sure the last instruction generated our source reg. */
2173 if (src.file != GRF ||
2174 src.file != last_rhs_inst->dst.file ||
2175 src.reg != last_rhs_inst->dst.reg ||
2176 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2177 src.reladdr ||
2178 src.abs ||
2179 src.negate ||
2180 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2181 return false;
2182
2183 /* Check that the last instruction fully initialized the channels
2184 * we want to use, in the order we want to use them. We could
2185 * potentially reswizzle the operands of many instructions so that
2186 * we could handle out of order channels, but don't yet.
2187 */
2188
2189 for (unsigned i = 0; i < 4; i++) {
2190 if (dst.writemask & (1 << i)) {
2191 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2192 return false;
2193
2194 if (BRW_GET_SWZ(src.swizzle, i) != i)
2195 return false;
2196 }
2197 }
2198
2199 /* Success! Rewrite the instruction. */
2200 last_rhs_inst->dst.file = dst.file;
2201 last_rhs_inst->dst.reg = dst.reg;
2202 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2203 last_rhs_inst->dst.reladdr = dst.reladdr;
2204 last_rhs_inst->dst.writemask &= dst.writemask;
2205
2206 return true;
2207 }
2208
2209 void
2210 vec4_visitor::visit(ir_assignment *ir)
2211 {
2212 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2213 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2214
2215 if (!ir->lhs->type->is_scalar() &&
2216 !ir->lhs->type->is_vector()) {
2217 ir->rhs->accept(this);
2218 src_reg src = this->result;
2219
2220 if (ir->condition) {
2221 emit_bool_to_cond_code(ir->condition, &predicate);
2222 }
2223
2224 /* emit_block_move doesn't account for swizzles in the source register.
2225 * This should be ok, since the source register is a structure or an
2226 * array, and those can't be swizzled. But double-check to be sure.
2227 */
2228 assert(src.swizzle ==
2229 (ir->rhs->type->is_matrix()
2230 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2231 : BRW_SWIZZLE_NOOP));
2232
2233 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2234 return;
2235 }
2236
2237 /* Now we're down to just a scalar/vector with writemasks. */
2238 int i;
2239
2240 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2241 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2242
2243 ir->rhs->accept(this);
2244
2245 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2246
2247 int swizzles[4];
2248 int src_chan = 0;
2249
2250 assert(ir->lhs->type->is_vector() ||
2251 ir->lhs->type->is_scalar());
2252 dst.writemask = ir->write_mask;
2253
2254 /* Swizzle a small RHS vector into the channels being written.
2255 *
2256 * glsl ir treats write_mask as dictating how many channels are
2257 * present on the RHS while in our instructions we need to make
2258 * those channels appear in the slots of the vec4 they're written to.
2259 */
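/* For example (a sketch): with write_mask = xz, the loop below produces
 * swizzles[] = {0, 0, 1, 0}, so RHS channel x feeds dst.x and RHS channel y
 * feeds dst.z; the unwritten channel fields default to x and are ignored.
 */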
2260 for (int i = 0; i < 4; i++)
2261 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2262
2263 src_reg src = swizzle(this->result,
2264 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2265 swizzles[2], swizzles[3]));
2266
2267 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2268 return;
2269 }
2270
2271 if (ir->condition) {
2272 emit_bool_to_cond_code(ir->condition, &predicate);
2273 }
2274
2275 for (i = 0; i < type_size(ir->lhs->type); i++) {
2276 vec4_instruction *inst = emit(MOV(dst, src));
2277 inst->predicate = predicate;
2278
2279 dst.reg_offset++;
2280 src.reg_offset++;
2281 }
2282 }
2283
2284 void
2285 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2286 {
2287 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2288 foreach_in_list(ir_constant, field_value, &ir->components) {
2289 emit_constant_values(dst, field_value);
2290 }
2291 return;
2292 }
2293
2294 if (ir->type->is_array()) {
2295 for (unsigned int i = 0; i < ir->type->length; i++) {
2296 emit_constant_values(dst, ir->array_elements[i]);
2297 }
2298 return;
2299 }
2300
2301 if (ir->type->is_matrix()) {
2302 for (int i = 0; i < ir->type->matrix_columns; i++) {
2303 float *vec = &ir->value.f[i * ir->type->vector_elements];
2304
2305 for (int j = 0; j < ir->type->vector_elements; j++) {
2306 dst->writemask = 1 << j;
2307 dst->type = BRW_REGISTER_TYPE_F;
2308
2309 emit(MOV(*dst, src_reg(vec[j])));
2310 }
2311 dst->reg_offset++;
2312 }
2313 return;
2314 }
2315
2316 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2317
2318 for (int i = 0; i < ir->type->vector_elements; i++) {
2319 if (!(remaining_writemask & (1 << i)))
2320 continue;
2321
2322 dst->writemask = 1 << i;
2323 dst->type = brw_type_for_base_type(ir->type);
2324
2325 /* Find other components that match the one we're about to
2326 * write. Emits fewer instructions for things like vec4(0.5,
2327 * 1.5, 1.5, 1.5).
2328 */
2329 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2330 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2331 if (ir->value.b[i] == ir->value.b[j])
2332 dst->writemask |= (1 << j);
2333 } else {
2334 /* u, i, and f storage all line up, so no need for a
2335 * switch case for comparing each type.
2336 */
2337 if (ir->value.u[i] == ir->value.u[j])
2338 dst->writemask |= (1 << j);
2339 }
2340 }
2341
2342 switch (ir->type->base_type) {
2343 case GLSL_TYPE_FLOAT:
2344 emit(MOV(*dst, src_reg(ir->value.f[i])));
2345 break;
2346 case GLSL_TYPE_INT:
2347 emit(MOV(*dst, src_reg(ir->value.i[i])));
2348 break;
2349 case GLSL_TYPE_UINT:
2350 emit(MOV(*dst, src_reg(ir->value.u[i])));
2351 break;
2352 case GLSL_TYPE_BOOL:
2353 emit(MOV(*dst,
2354 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2355 : 0)));
2356 break;
2357 default:
2358 unreachable("Non-float/uint/int/bool constant");
2359 }
2360
2361 remaining_writemask &= ~dst->writemask;
2362 }
2363 dst->reg_offset++;
2364 }
2365
2366 void
2367 vec4_visitor::visit(ir_constant *ir)
2368 {
2369 dst_reg dst = dst_reg(this, ir->type);
2370 this->result = src_reg(dst);
2371
2372 emit_constant_values(&dst, ir);
2373 }
2374
2375 void
2376 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2377 {
2378 ir_dereference *deref = static_cast<ir_dereference *>(
2379 ir->actual_parameters.get_head());
2380 ir_variable *location = deref->variable_referenced();
2381 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2382 location->data.binding);
2383
2384 /* Calculate the surface offset */
2385 src_reg offset(this, glsl_type::uint_type);
2386 ir_dereference_array *deref_array = deref->as_dereference_array();
2387 if (deref_array) {
2388 deref_array->array_index->accept(this);
2389
2390 src_reg tmp(this, glsl_type::uint_type);
2391 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2392 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2393 } else {
2394 offset = location->data.atomic.offset;
2395 }
2396
2397 /* Emit the appropriate machine instruction */
2398 const char *callee = ir->callee->function_name();
2399 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2400
2401 if (!strcmp("__intrinsic_atomic_read", callee)) {
2402 emit_untyped_surface_read(surf_index, dst, offset);
2403
2404 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2405 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2406 src_reg(), src_reg());
2407
2408 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2409 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2410 src_reg(), src_reg());
2411 }
2412 }
2413
2414 void
2415 vec4_visitor::visit(ir_call *ir)
2416 {
2417 const char *callee = ir->callee->function_name();
2418
2419 if (!strcmp("__intrinsic_atomic_read", callee) ||
2420 !strcmp("__intrinsic_atomic_increment", callee) ||
2421 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2422 visit_atomic_counter_intrinsic(ir);
2423 } else {
2424 unreachable("Unsupported intrinsic.");
2425 }
2426 }
2427
2428 src_reg
2429 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2430 {
2431 vec4_instruction *inst =
2432 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2433 dst_reg(this, glsl_type::uvec4_type));
2434 inst->base_mrf = 2;
2435 inst->mlen = 1;
2436 inst->src[1] = sampler;
2437
2438 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2439 int param_base = inst->base_mrf;
2440 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2441 int zero_mask = 0xf & ~coord_mask;
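/* Write the coordinate components into the payload and zero-fill whatever
 * channels the coordinate doesn't cover, so the whole vec4 is defined.
 */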
2442
2443 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2444 coordinate));
2445
2446 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2447 src_reg(0)));
2448
2449 emit(inst);
2450 return src_reg(inst->dst);
2451 }
2452
2453 static bool
2454 is_high_sampler(struct brw_context *brw, src_reg sampler)
2455 {
2456 if (brw->gen < 8 && !brw->is_haswell)
2457 return false;
2458
2459 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2460 }
2461
2462 void
2463 vec4_visitor::visit(ir_texture *ir)
2464 {
2465 uint32_t sampler =
2466 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2467
2468 ir_rvalue *nonconst_sampler_index =
2469 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2470
2471 /* Handle non-constant sampler array indexing */
2472 src_reg sampler_reg;
2473 if (nonconst_sampler_index) {
2474 /* The highest sampler which may be used by this operation is
2475 * the last element of the array. Mark it here, because the generator
2476 * doesn't have enough information to determine the bound.
2477 */
2478 uint32_t array_size = ir->sampler->as_dereference_array()
2479 ->array->type->array_size();
2480
2481 uint32_t max_used = sampler + array_size - 1;
2482 if (ir->op == ir_tg4 && brw->gen < 8) {
2483 max_used += prog_data->base.binding_table.gather_texture_start;
2484 } else {
2485 max_used += prog_data->base.binding_table.texture_start;
2486 }
2487
2488 brw_mark_surface_used(&prog_data->base, max_used);
2489
2490 /* Emit code to evaluate the actual indexing expression */
2491 nonconst_sampler_index->accept(this);
2492 dst_reg temp(this, glsl_type::uint_type);
2493 emit(ADD(temp, this->result, src_reg(sampler)))
2494 ->force_writemask_all = true;
2495 sampler_reg = src_reg(temp);
2496 } else {
2497 /* Single sampler, or constant array index; the indexing expression
2498 * is just an immediate.
2499 */
2500 sampler_reg = src_reg(sampler);
2501 }
2502
2503 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2504 * emitting anything other than setting up the constant result.
2505 */
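/* For example, a channel swizzled to ONE makes the gather return 1.0 (or 1
 * for integer formats) for all four texels, so a single MOV of the constant
 * is enough.
 */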
2506 if (ir->op == ir_tg4) {
2507 ir_constant *chan = ir->lod_info.component->as_constant();
2508 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2509 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2510 dst_reg result(this, ir->type);
2511 this->result = src_reg(result);
2512 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2513 return;
2514 }
2515 }
2516
2517 /* Should be lowered by do_lower_texture_projection */
2518 assert(!ir->projector);
2519
2520 /* Should be lowered */
2521 assert(!ir->offset || !ir->offset->type->is_array());
2522
2523 /* Generate code to compute all the subexpression trees. This has to be
2524 * done before loading any values into MRFs for the sampler message since
2525 * generating these values may involve SEND messages that need the MRFs.
2526 */
2527 src_reg coordinate;
2528 if (ir->coordinate) {
2529 ir->coordinate->accept(this);
2530 coordinate = this->result;
2531 }
2532
2533 src_reg shadow_comparitor;
2534 if (ir->shadow_comparitor) {
2535 ir->shadow_comparitor->accept(this);
2536 shadow_comparitor = this->result;
2537 }
2538
2539 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2540 src_reg offset_value;
2541 if (has_nonconstant_offset) {
2542 ir->offset->accept(this);
2543 offset_value = src_reg(this->result);
2544 }
2545
2546 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2547 src_reg lod, dPdx, dPdy, sample_index, mcs;
2548 switch (ir->op) {
2549 case ir_tex:
2550 lod = src_reg(0.0f);
2551 lod_type = glsl_type::float_type;
2552 break;
2553 case ir_txf:
2554 case ir_txl:
2555 case ir_txs:
2556 ir->lod_info.lod->accept(this);
2557 lod = this->result;
2558 lod_type = ir->lod_info.lod->type;
2559 break;
2560 case ir_query_levels:
2561 lod = src_reg(0);
2562 lod_type = glsl_type::int_type;
2563 break;
2564 case ir_txf_ms:
2565 ir->lod_info.sample_index->accept(this);
2566 sample_index = this->result;
2567 sample_index_type = ir->lod_info.sample_index->type;
2568
2569 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2570 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2571 else
2572 mcs = src_reg(0u);
2573 break;
2574 case ir_txd:
2575 ir->lod_info.grad.dPdx->accept(this);
2576 dPdx = this->result;
2577
2578 ir->lod_info.grad.dPdy->accept(this);
2579 dPdy = this->result;
2580
2581 lod_type = ir->lod_info.grad.dPdx->type;
2582 break;
2583 case ir_txb:
2584 case ir_lod:
2585 case ir_tg4:
2586 break;
2587 }
2588
2589 enum opcode opcode;
2590 switch (ir->op) {
2591 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2592 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2593 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2594 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2595 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2596 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2597 case ir_tg4: opcode = has_nonconstant_offset
2598 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2599 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2600 case ir_txb:
2601 unreachable("TXB is not valid for vertex shaders.");
2602 case ir_lod:
2603 unreachable("LOD is not valid for vertex shaders.");
2604 default:
2605 unreachable("Unrecognized tex op");
2606 }
2607
2608 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2609 opcode, dst_reg(this, ir->type));
2610
2611 if (ir->offset != NULL && !has_nonconstant_offset) {
2612 inst->offset =
2613 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2614 ir->offset->type->vector_elements);
2615 }
2616
2617 /* Stuff the channel select bits in the top of the texture offset */
2618 if (ir->op == ir_tg4)
2619 inst->offset |= gather_channel(ir, sampler) << 16;
2620
2621 /* The message header is necessary for:
2622 * - Gen4 (always)
2623 * - Gen9+ for selecting SIMD4x2
2624 * - Texel offsets
2625 * - Gather channel selection
2626 * - Sampler indices too large to fit in a 4-bit value.
2627 */
2628 inst->header_present =
2629 brw->gen < 5 || brw->gen >= 9 ||
2630 inst->offset != 0 || ir->op == ir_tg4 ||
2631 is_high_sampler(brw, sampler_reg);
2632 inst->base_mrf = 2;
2633 inst->mlen = inst->header_present + 1; /* always at least one */
2634 inst->dst.writemask = WRITEMASK_XYZW;
2635 inst->shadow_compare = ir->shadow_comparitor != NULL;
2636
2637 inst->src[1] = sampler_reg;
2638
2639 /* MRF for the first parameter */
2640 int param_base = inst->base_mrf + inst->header_present;
2641
2642 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2643 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2644 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2645 } else {
2646 /* Load the coordinate */
2647 /* FINISHME: gl_clamp_mask and saturate */
2648 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2649 int zero_mask = 0xf & ~coord_mask;
2650
2651 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2652 coordinate));
2653
2654 if (zero_mask != 0) {
2655 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2656 src_reg(0)));
2657 }
2658 /* Load the shadow comparitor */
2659 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2660 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2661 WRITEMASK_X),
2662 shadow_comparitor));
2663 inst->mlen++;
2664 }
2665
2666 /* Load the LOD info */
2667 if (ir->op == ir_tex || ir->op == ir_txl) {
2668 int mrf, writemask;
2669 if (brw->gen >= 5) {
2670 mrf = param_base + 1;
2671 if (ir->shadow_comparitor) {
2672 writemask = WRITEMASK_Y;
2673 /* mlen already incremented */
2674 } else {
2675 writemask = WRITEMASK_X;
2676 inst->mlen++;
2677 }
2678 } else /* brw->gen == 4 */ {
2679 mrf = param_base;
2680 writemask = WRITEMASK_W;
2681 }
2682 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2683 } else if (ir->op == ir_txf) {
2684 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2685 } else if (ir->op == ir_txf_ms) {
2686 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2687 sample_index));
2688 if (brw->gen >= 7) {
2689 /* MCS data is in the first channel of `mcs`, but we need to get it into
2690 * the .y channel of the second vec4 of params, so replicate .x across
2691 * the whole vec4 and then mask off everything except .y
2692 */
2693 mcs.swizzle = BRW_SWIZZLE_XXXX;
2694 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2695 mcs));
2696 }
2697 inst->mlen++;
2698 } else if (ir->op == ir_txd) {
2699 const glsl_type *type = lod_type;
2700
2701 if (brw->gen >= 5) {
2702 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2703 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2704 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2705 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2706 inst->mlen++;
2707
2708 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2709 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2710 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2711 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2712 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2713 inst->mlen++;
2714
2715 if (ir->shadow_comparitor) {
2716 emit(MOV(dst_reg(MRF, param_base + 2,
2717 ir->shadow_comparitor->type, WRITEMASK_Z),
2718 shadow_comparitor));
2719 }
2720 }
2721 } else /* brw->gen == 4 */ {
2722 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2723 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2724 inst->mlen += 2;
2725 }
2726 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2727 if (ir->shadow_comparitor) {
2728 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2729 shadow_comparitor));
2730 }
2731
2732 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2733 offset_value));
2734 inst->mlen++;
2735 }
2736 }
2737
2738 emit(inst);
2739
2740 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2741 * faces * layers, but the spec requires just layers.
2742 */
2743 if (ir->op == ir_txs) {
2744 glsl_type const *type = ir->sampler->type;
2745 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2746 type->sampler_array) {
2747 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2748 writemask(inst->dst, WRITEMASK_Z),
2749 src_reg(inst->dst), src_reg(6));
2750 }
2751 }
2752
2753 if (brw->gen == 6 && ir->op == ir_tg4) {
2754 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2755 }
2756
2757 swizzle_result(ir, src_reg(inst->dst), sampler);
2758 }
2759
2760 /**
2761 * Apply workarounds for Gen6 gather with UINT/SINT
2762 */
2763 void
2764 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2765 {
2766 if (!wa)
2767 return;
2768
2769 int width = (wa & WA_8BIT) ? 8 : 16;
2770 dst_reg dst_f = dst;
2771 dst_f.type = BRW_REGISTER_TYPE_F;
2772
2773 /* Convert from UNORM to UINT */
2774 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2775 emit(MOV(dst, src_reg(dst_f)));
2776
2777 if (wa & WA_SIGN) {
2778 /* Reinterpret the UINT value as a signed INT value by
2779 * shifting the sign bit into place, then shifting back
2780 * preserving sign.
2781 */
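/* For example (a sketch): with an 8-bit workaround, SHL by 24 moves bit 7
 * up into bit 31 and ASR by 24 replicates it back down, sign-extending the
 * 8-bit value to a full 32-bit integer.
 */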
2782 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2783 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2784 }
2785 }
2786
2787 /**
2788 * Set up the gather channel based on the swizzle, for gather4.
2789 */
2790 uint32_t
2791 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2792 {
2793 ir_constant *chan = ir->lod_info.component->as_constant();
2794 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2795 switch (swiz) {
2796 case SWIZZLE_X: return 0;
2797 case SWIZZLE_Y:
2798 /* gather4 sampler is broken for green channel on RG32F --
2799 * we must ask for blue instead.
2800 */
2801 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2802 return 2;
2803 return 1;
2804 case SWIZZLE_Z: return 2;
2805 case SWIZZLE_W: return 3;
2806 default:
2807 unreachable("Not reached"); /* zero, one swizzles handled already */
2808 }
2809 }
2810
2811 void
2812 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2813 {
2814 int s = key->tex.swizzles[sampler];
2815
2816 this->result = src_reg(this, ir->type);
2817 dst_reg swizzled_result(this->result);
2818
2819 if (ir->op == ir_query_levels) {
2820 /* # levels is in .w */
2821 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2822 emit(MOV(swizzled_result, orig_val));
2823 return;
2824 }
2825
2826 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2827 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2828 emit(MOV(swizzled_result, orig_val));
2829 return;
2830 }
2831
2832
2833 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2834 int swizzle[4] = {0};
2835
2836 for (int i = 0; i < 4; i++) {
2837 switch (GET_SWZ(s, i)) {
2838 case SWIZZLE_ZERO:
2839 zero_mask |= (1 << i);
2840 break;
2841 case SWIZZLE_ONE:
2842 one_mask |= (1 << i);
2843 break;
2844 default:
2845 copy_mask |= (1 << i);
2846 swizzle[i] = GET_SWZ(s, i);
2847 break;
2848 }
2849 }
2850
2851 if (copy_mask) {
2852 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2853 swizzled_result.writemask = copy_mask;
2854 emit(MOV(swizzled_result, orig_val));
2855 }
2856
2857 if (zero_mask) {
2858 swizzled_result.writemask = zero_mask;
2859 emit(MOV(swizzled_result, src_reg(0.0f)));
2860 }
2861
2862 if (one_mask) {
2863 swizzled_result.writemask = one_mask;
2864 emit(MOV(swizzled_result, src_reg(1.0f)));
2865 }
2866 }
2867
2868 void
2869 vec4_visitor::visit(ir_return *)
2870 {
2871 unreachable("not reached");
2872 }
2873
2874 void
2875 vec4_visitor::visit(ir_discard *)
2876 {
2877 unreachable("not reached");
2878 }
2879
2880 void
2881 vec4_visitor::visit(ir_if *ir)
2882 {
2883 /* Don't point the annotation at the if statement, because then it plus
2884 * the then and else blocks get printed.
2885 */
2886 this->base_ir = ir->condition;
2887
2888 if (brw->gen == 6) {
2889 emit_if_gen6(ir);
2890 } else {
2891 enum brw_predicate predicate;
2892 emit_bool_to_cond_code(ir->condition, &predicate);
2893 emit(IF(predicate));
2894 }
2895
2896 visit_instructions(&ir->then_instructions);
2897
2898 if (!ir->else_instructions.is_empty()) {
2899 this->base_ir = ir->condition;
2900 emit(BRW_OPCODE_ELSE);
2901
2902 visit_instructions(&ir->else_instructions);
2903 }
2904
2905 this->base_ir = ir->condition;
2906 emit(BRW_OPCODE_ENDIF);
2907 }
2908
2909 void
2910 vec4_visitor::visit(ir_emit_vertex *)
2911 {
2912 unreachable("not reached");
2913 }
2914
2915 void
2916 vec4_visitor::visit(ir_end_primitive *)
2917 {
2918 unreachable("not reached");
2919 }
2920
2921 void
2922 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2923 dst_reg dst, src_reg offset,
2924 src_reg src0, src_reg src1)
2925 {
2926 unsigned mlen = 0;
2927
2928 /* Set the atomic operation offset. */
2929 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2930 mlen++;
2931
2932 /* Set the atomic operation arguments. */
2933 if (src0.file != BAD_FILE) {
2934 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2935 mlen++;
2936 }
2937
2938 if (src1.file != BAD_FILE) {
2939 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2940 mlen++;
2941 }
2942
2943 /* Emit the instruction. Note that this maps to the normal SIMD8
2944 * untyped atomic message on Ivy Bridge, but that's OK because
2945 * unused channels will be masked out.
2946 */
2947 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2948 src_reg(atomic_op), src_reg(surf_index));
2949 inst->base_mrf = 0;
2950 inst->mlen = mlen;
2951 }
2952
2953 void
2954 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2955 src_reg offset)
2956 {
2957 /* Set the surface read offset. */
2958 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2959
2960 /* Emit the instruction. Note that this maps to the normal SIMD8
2961 * untyped surface read message, but that's OK because unused
2962 * channels will be masked out.
2963 */
2964 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2965 dst, src_reg(surf_index));
2966 inst->base_mrf = 0;
2967 inst->mlen = 1;
2968 }
2969
2970 void
2971 vec4_visitor::emit_ndc_computation()
2972 {
2973 /* Get the position */
2974 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2975
2976 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2977 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2978 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2979
2980 current_annotation = "NDC";
2981 dst_reg ndc_w = ndc;
2982 ndc_w.writemask = WRITEMASK_W;
2983 src_reg pos_w = pos;
2984 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2985 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2986
2987 dst_reg ndc_xyz = ndc;
2988 ndc_xyz.writemask = WRITEMASK_XYZ;
2989
2990 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2991 }
2992
2993 void
2994 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2995 {
2996 if (brw->gen < 6 &&
2997 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2998 key->userclip_active || brw->has_negative_rhw_bug)) {
2999 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3000 dst_reg header1_w = header1;
3001 header1_w.writemask = WRITEMASK_W;
3002
3003 emit(MOV(header1, 0u));
3004
3005 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3006 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3007
3008 current_annotation = "Point size";
3009 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3010 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3011 }
3012
3013 if (key->userclip_active) {
3014 current_annotation = "Clipping flags";
3015 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3016 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3017
3018 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3019 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3020 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3021
3022 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3023 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3024 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3025 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3026 }
3027
3028 /* i965 clipping workaround:
3029 * 1) Test for -ve rhw
3030 * 2) If set,
3031 * set ndc = (0,0,0,0)
3032 * set ucp[6] = 1
3033 *
3034 * Later, clipping will detect ucp[6] and ensure the primitive is
3035 * clipped against all fixed planes.
3036 */
3037 if (brw->has_negative_rhw_bug) {
3038 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3039 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3040 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3041 vec4_instruction *inst;
3042 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3043 inst->predicate = BRW_PREDICATE_NORMAL;
3044 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3045 inst->predicate = BRW_PREDICATE_NORMAL;
3046 }
3047
3048 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3049 } else if (brw->gen < 6) {
3050 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3051 } else {
3052 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3053 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3054 dst_reg reg_w = reg;
3055 reg_w.writemask = WRITEMASK_W;
3056 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3057 }
3058 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3059 dst_reg reg_y = reg;
3060 reg_y.writemask = WRITEMASK_Y;
3061 reg_y.type = BRW_REGISTER_TYPE_D;
3062 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3063 }
3064 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3065 dst_reg reg_z = reg;
3066 reg_z.writemask = WRITEMASK_Z;
3067 reg_z.type = BRW_REGISTER_TYPE_D;
3068 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3069 }
3070 }
3071 }
3072
3073 void
3074 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3075 {
3076 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3077 *
3078 * "If a linked set of shaders forming the vertex stage contains no
3079 * static write to gl_ClipVertex or gl_ClipDistance, but the
3080 * application has requested clipping against user clip planes through
3081 * the API, then the coordinate written to gl_Position is used for
3082 * comparison against the user clip planes."
3083 *
3084 * This function is only called if the shader didn't write to
3085 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3086 * if the user wrote to it; otherwise we use gl_Position.
3087 */
3088 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3089 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3090 clip_vertex = VARYING_SLOT_POS;
3091 }
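/* Each enabled user clip plane gets a DP4 of the clip vertex against the
 * plane equation; the signed distance lands in the single channel of the
 * clip distance slot selected by the writemask below.
 */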
3092
3093 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3094 ++i) {
3095 reg.writemask = 1 << i;
3096 emit(DP4(reg,
3097 src_reg(output_reg[clip_vertex]),
3098 src_reg(this->userplane[i + offset])));
3099 }
3100 }
3101
3102 vec4_instruction *
3103 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3104 {
3105 assert (varying < VARYING_SLOT_MAX);
3106 reg.type = output_reg[varying].type;
3107 current_annotation = output_reg_annotation[varying];
3108 /* Copy the register, saturating if necessary */
3109 return emit(MOV(reg, src_reg(output_reg[varying])));
3110 }
3111
3112 void
3113 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3114 {
3115 reg.type = BRW_REGISTER_TYPE_F;
3116
3117 switch (varying) {
3118 case VARYING_SLOT_PSIZ:
3119 {
3120 /* PSIZ is always in slot 0, and is coupled with other flags. */
3121 current_annotation = "indices, point width, clip flags";
3122 emit_psiz_and_flags(reg);
3123 break;
3124 }
3125 case BRW_VARYING_SLOT_NDC:
3126 current_annotation = "NDC";
3127 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3128 break;
3129 case VARYING_SLOT_POS:
3130 current_annotation = "gl_Position";
3131 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3132 break;
3133 case VARYING_SLOT_EDGE:
3134 /* This is present when doing unfilled polygons. We're supposed to copy
3135 * the edge flag from the user-provided vertex array
3136 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3137 * of that attribute (starts as 1.0f). This is then used in clipping to
3138 * determine which edges should be drawn as wireframe.
3139 */
3140 current_annotation = "edge flag";
3141 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3142 glsl_type::float_type, WRITEMASK_XYZW))));
3143 break;
3144 case BRW_VARYING_SLOT_PAD:
3145 /* No need to write to this slot */
3146 break;
3147 case VARYING_SLOT_COL0:
3148 case VARYING_SLOT_COL1:
3149 case VARYING_SLOT_BFC0:
3150 case VARYING_SLOT_BFC1: {
3151 /* These built-in varyings are only supported in compatibility mode,
3152 * and we only support GS in core profile. So, this must be a vertex
3153 * shader.
3154 */
3155 assert(stage == MESA_SHADER_VERTEX);
3156 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3157 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3158 inst->saturate = true;
3159 break;
3160 }
3161
3162 default:
3163 emit_generic_urb_slot(reg, varying);
3164 break;
3165 }
3166 }
3167
3168 static int
3169 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3170 {
3171 if (brw->gen >= 6) {
3172 /* URB data written (does not include the message header reg) must
3173 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3174 * section 5.4.3.2.2: URB_INTERLEAVED.
3175 *
3176 * URB entries are allocated on a multiple of 1024 bits, so an
3177 * extra 128 bits written here to make the end align to 256 is
3178 * no problem.
3179 */
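/* Note that mlen as passed in still includes the message header register,
 * so the URB data length is mlen - 1; requiring that to be even means mlen
 * itself must be odd, hence the parity check below.
 */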
3180 if ((mlen % 2) != 1)
3181 mlen++;
3182 }
3183
3184 return mlen;
3185 }
3186
3187
3188 /**
3189 * Generates the VUE payload plus the necessary URB write instructions to
3190 * output it.
3191 *
3192 * The VUE layout is documented in Volume 2a.
3193 */
3194 void
3195 vec4_visitor::emit_vertex()
3196 {
3197 /* MRF 0 is reserved for the debugger, so start with message header
3198 * in MRF 1.
3199 */
3200 int base_mrf = 1;
3201 int mrf = base_mrf;
3202 /* In the process of generating our URB write message contents, we
3203 * may need to unspill a register or load from an array. Those
3204 * reads would use MRFs 14-15.
3205 */
3206 int max_usable_mrf = 13;
3207
3208 /* The following assertion verifies that max_usable_mrf causes an
3209 * even-numbered amount of URB write data, which will meet gen6's
3210 * requirements for length alignment.
3211 */
3212 assert ((max_usable_mrf - base_mrf) % 2 == 0);
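/* As a concrete example: with base_mrf = 1 and max_usable_mrf = 13, a full
 * URB write uses MRFs 2..13 for data (12 registers, an even count) plus the
 * header in MRF 1, for an mlen of 13.
 */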
3213
3214 /* First mrf is the g0-based message header containing URB handles and
3215 * such.
3216 */
3217 emit_urb_write_header(mrf++);
3218
3219 if (brw->gen < 6) {
3220 emit_ndc_computation();
3221 }
3222
3223 /* Lower legacy ff and ClipVertex clipping to clip distances */
3224 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3225 current_annotation = "user clip distances";
3226
3227 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3228 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3229
3230 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3231 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3232 }
3233
3234 /* We may need to split this up into several URB writes, so do them in a
3235 * loop.
3236 */
3237 int slot = 0;
3238 bool complete = false;
3239 do {
3240 /* URB offset is in URB row increments, and each of our MRFs is half of
3241 * one of those, since we're doing interleaved writes.
3242 */
3243 int offset = slot / 2;
3244
3245 mrf = base_mrf + 1;
3246 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3247 emit_urb_slot(dst_reg(MRF, mrf++),
3248 prog_data->vue_map.slot_to_varying[slot]);
3249
3250 /* If this was max_usable_mrf, we can't fit anything more into this
3251 * URB WRITE.
3252 */
3253 if (mrf > max_usable_mrf) {
3254 slot++;
3255 break;
3256 }
3257 }
3258
3259 complete = slot >= prog_data->vue_map.num_slots;
3260 current_annotation = "URB write";
3261 vec4_instruction *inst = emit_urb_write_opcode(complete);
3262 inst->base_mrf = base_mrf;
3263 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3264 inst->offset += offset;
3265 } while(!complete);
3266 }
3267
3268
3269 src_reg
3270 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3271 src_reg *reladdr, int reg_offset)
3272 {
3273 /* Because we store the values to scratch interleaved like our
3274 * vertex data, we need to scale the vec4 index by 2.
3275 */
3276 int message_header_scale = 2;
3277
3278 /* Pre-gen6, the message header uses byte offsets instead of vec4
3279 * (16-byte) offset units.
3280 */
3281 if (brw->gen < 6)
3282 message_header_scale *= 16;
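/* So, as a sketch of the scaling: vec4 slot 3 becomes offset 6 in 16-byte
 * units on Gen6+, or 96 in byte units pre-Gen6.
 */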
3283
3284 if (reladdr) {
3285 src_reg index = src_reg(this, glsl_type::int_type);
3286
3287 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3288 src_reg(reg_offset)));
3289 emit_before(block, inst, MUL(dst_reg(index), index,
3290 src_reg(message_header_scale)));
3291
3292 return index;
3293 } else {
3294 return src_reg(reg_offset * message_header_scale);
3295 }
3296 }
3297
3298 src_reg
3299 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3300 src_reg *reladdr, int reg_offset)
3301 {
3302 if (reladdr) {
3303 src_reg index = src_reg(this, glsl_type::int_type);
3304
3305 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3306 src_reg(reg_offset)));
3307
3308 /* Pre-gen6, the message header uses byte offsets instead of vec4
3309 * (16-byte) offset units.
3310 */
3311 if (brw->gen < 6) {
3312 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3313 }
3314
3315 return index;
3316 } else if (brw->gen >= 8) {
3317 /* Store the offset in a GRF so we can send-from-GRF. */
3318 src_reg offset = src_reg(this, glsl_type::int_type);
3319 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3320 return offset;
3321 } else {
3322 int message_header_scale = brw->gen < 6 ? 16 : 1;
3323 return src_reg(reg_offset * message_header_scale);
3324 }
3325 }
3326
3327 /**
3328 * Emits an instruction before @inst to load the value named by @orig_src
3329 * from scratch space at @base_offset to @temp.
3330 *
3331 * @base_offset is measured in 32-byte units (the size of a register).
3332 */
3333 void
3334 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3335 dst_reg temp, src_reg orig_src,
3336 int base_offset)
3337 {
3338 int reg_offset = base_offset + orig_src.reg_offset;
3339 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3340 reg_offset);
3341
3342 emit_before(block, inst, SCRATCH_READ(temp, index));
3343 }
3344
3345 /**
3346 * Emits an instruction after @inst to store the value to be written
3347 * to @orig_dst to scratch space at @base_offset, from @temp.
3348 *
3349 * @base_offset is measured in 32-byte units (the size of a register).
3350 */
3351 void
3352 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3353 int base_offset)
3354 {
3355 int reg_offset = base_offset + inst->dst.reg_offset;
3356 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3357 reg_offset);
3358
3359 /* Create a temporary register to store *inst's result in.
3360 *
3361 * We have to be careful in MOVing from our temporary result register in
3362 * the scratch write. If we swizzle from channels of the temporary that
3363 * weren't initialized, it will confuse live interval analysis, which will
3364 * make spilling fail to make progress.
3365 */
3366 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3367 inst->dst.type),
3368 brw_swizzle_for_mask(inst->dst.writemask));
3369 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3370 inst->dst.writemask));
3371 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3372 write->predicate = inst->predicate;
3373 write->ir = inst->ir;
3374 write->annotation = inst->annotation;
3375 inst->insert_after(block, write);
3376
3377 inst->dst.file = temp.file;
3378 inst->dst.reg = temp.reg;
3379 inst->dst.reg_offset = temp.reg_offset;
3380 inst->dst.reladdr = NULL;
3381 }
3382
3383 /**
3384 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3385 * adds the scratch read(s) before \p inst. The function also checks for
3386 * recursive reladdr scratch accesses, issuing the corresponding scratch
3387 * loads and rewriting reladdr references accordingly.
3388 *
3389 * \return \p src if it did not require a scratch load, otherwise, the
3390 * register holding the result of the scratch load that the caller should
3391 * use to rewrite src.
3392 */
3393 src_reg
3394 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3395 vec4_instruction *inst, src_reg src)
3396 {
3397 /* Resolve recursive reladdr scratch access by calling ourselves
3398 * with src.reladdr
3399 */
3400 if (src.reladdr)
3401 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3402 *src.reladdr);
3403
3404 /* Now handle scratch access on src */
3405 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3406 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3407 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3408 src.reg = temp.reg;
3409 src.reg_offset = temp.reg_offset;
3410 src.reladdr = NULL;
3411 }
3412
3413 return src;
3414 }
3415
3416 /**
3417 * We can't generally support array access in GRF space, because a
3418 * single instruction's destination can only span 2 contiguous
3419 * registers. So, we send all GRF arrays that get variable index
3420 * access to scratch space.
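 *
 * For example (hypothetical GLSL, for illustration): a local "vec4 arr[8]"
 * indexed by a value that isn't known at compile time is moved to scratch
 * and accessed through scratch read/write messages.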
3421 */
3422 void
3423 vec4_visitor::move_grf_array_access_to_scratch()
3424 {
3425 int scratch_loc[this->alloc.count];
3426 memset(scratch_loc, -1, sizeof(scratch_loc));
3427
3428 /* First, calculate the set of virtual GRFs that need to be punted
3429 * to scratch due to having any array access on them, and where in
3430 * scratch.
3431 */
3432 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3433 if (inst->dst.file == GRF && inst->dst.reladdr) {
3434 if (scratch_loc[inst->dst.reg] == -1) {
3435 scratch_loc[inst->dst.reg] = c->last_scratch;
3436 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3437 }
3438
3439 for (src_reg *iter = inst->dst.reladdr;
3440 iter->reladdr;
3441 iter = iter->reladdr) {
3442 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3443 scratch_loc[iter->reg] = c->last_scratch;
3444 c->last_scratch += this->alloc.sizes[iter->reg];
3445 }
3446 }
3447 }
3448
3449 for (int i = 0 ; i < 3; i++) {
3450 for (src_reg *iter = &inst->src[i];
3451 iter->reladdr;
3452 iter = iter->reladdr) {
3453 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3454 scratch_loc[iter->reg] = c->last_scratch;
3455 c->last_scratch += this->alloc.sizes[iter->reg];
3456 }
3457 }
3458 }
3459 }
3460
3461 /* Now, for anything that will be accessed through scratch, rewrite
3462 * it to load/store. Note that this is a _safe list walk, because
3463 * we may generate a new scratch_write instruction after the one
3464 * we're processing.
3465 */
3466 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3467 /* Set up the annotation tracking for new generated instructions. */
3468 base_ir = inst->ir;
3469 current_annotation = inst->annotation;
3470
3471 /* First handle scratch access on the dst. Notice we have to handle
3472 * the case where the dst's reladdr also points to scratch space.
3473 */
3474 if (inst->dst.reladdr)
3475 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3476 *inst->dst.reladdr);
3477
3478 /* Now that we have handled any (possibly recursive) reladdr scratch
3479 * accesses for dst we can safely do the scratch write for dst itself
3480 */
3481 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3482 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3483
3484 /* Now handle scratch access on any src. In this case, since inst->src[i]
3485 * already is a src_reg, we can just call emit_resolve_reladdr with
3486 * inst->src[i] and it will take care of handling scratch loads for
3487 * both src and src.reladdr (recursively).
3488 */
3489 for (int i = 0 ; i < 3; i++) {
3490 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3491 inst->src[i]);
3492 }
3493 }
3494 }
3495
3496 /**
3497 * Emits an instruction before @inst to load the value named by @orig_src
3498 * from the pull constant buffer (surface) at @base_offset to @temp.
3499 */
3500 void
3501 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3502 dst_reg temp, src_reg orig_src,
3503 int base_offset)
3504 {
3505 int reg_offset = base_offset + orig_src.reg_offset;
3506 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3507 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3508 reg_offset);
3509
3510 emit_pull_constant_load_reg(temp,
3511 index,
3512 offset,
3513 block, inst);
3514 }
3515
3516 /**
3517 * Implements array access of uniforms by inserting a
3518 * PULL_CONSTANT_LOAD instruction.
3519 *
3520 * Unlike temporary GRF array access (where we don't support it due to
3521 * the difficulty of doing relative addressing on instruction
3522 * destinations), we could potentially do array access of uniforms
3523 * that were loaded in GRF space as push constants. In real-world
3524 * usage we've seen, though, the arrays being used are always larger
3525 * than we could load as push constants, so just always move all
3526 * uniform array access out to a pull constant buffer.
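 * (For example, with hypothetical GLSL for illustration: a "uniform vec4
 * m[64]" indexed with a non-constant expression is accessed through
 * pull-constant loads rather than push constants.)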
3527 */
3528 void
3529 vec4_visitor::move_uniform_array_access_to_pull_constants()
3530 {
3531 int pull_constant_loc[this->uniforms];
3532 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3533 bool nested_reladdr;
3534
3535 /* Walk through and find array access of uniforms. Put a copy of that
3536 * uniform in the pull constant buffer.
3537 *
3538 * Note that we don't move constant-indexed accesses to arrays. No
3539 * testing has been done of the performance impact of this choice.
3540 */
3541 do {
3542 nested_reladdr = false;
3543
3544 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3545 for (int i = 0 ; i < 3; i++) {
3546 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3547 continue;
3548
3549 int uniform = inst->src[i].reg;
3550
3551 if (inst->src[i].reladdr->reladdr)
3552 nested_reladdr = true; /* will need another pass */
3553
3554 /* If this array isn't already present in the pull constant buffer,
3555 * add it.
3556 */
3557 if (pull_constant_loc[uniform] == -1) {
3558 const gl_constant_value **values =
3559 &stage_prog_data->param[uniform * 4];
3560
3561 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3562
3563 assert(uniform < uniform_array_size);
3564 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3565 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3566 = values[j];
3567 }
3568 }
3569
3570 /* Set up the annotation tracking for new generated instructions. */
3571 base_ir = inst->ir;
3572 current_annotation = inst->annotation;
3573
3574 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3575
3576 emit_pull_constant_load(block, inst, temp, inst->src[i],
3577 pull_constant_loc[uniform]);
3578
3579 inst->src[i].file = temp.file;
3580 inst->src[i].reg = temp.reg;
3581 inst->src[i].reg_offset = temp.reg_offset;
3582 inst->src[i].reladdr = NULL;
3583 }
3584 }
3585 } while (nested_reladdr);
3586
3587 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3588 * no need to track them as larger-than-vec4 objects. This will be
3589 * relied on in cutting out unused uniform vectors from push
3590 * constants.
3591 */
3592 split_uniform_registers();
3593 }
3594
3595 void
3596 vec4_visitor::resolve_ud_negate(src_reg *reg)
3597 {
3598 if (reg->type != BRW_REGISTER_TYPE_UD ||
3599 !reg->negate)
3600 return;
3601
3602 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3603 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3604 *reg = temp;
3605 }
3606
3607 /**
3608 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3609 *
3610 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3611 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3612 */
3613 void
3614 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3615 {
3616 assert(brw->gen <= 5);
3617
3618 if (!rvalue->type->is_boolean())
3619 return;
3620
3621 src_reg and_result = src_reg(this, rvalue->type);
3622 src_reg neg_result = src_reg(this, rvalue->type);
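/* ANDing with 1 keeps only the defined LSB (0 or 1); source-negating that
 * integer then yields 0 or ~0, which is the boolean encoding described
 * above (this describes the two emits below).
 */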
3623 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3624 emit(MOV(dst_reg(neg_result), negate(and_result)));
3625 *reg = neg_result;
3626 }
3627
3628 vec4_visitor::vec4_visitor(struct brw_context *brw,
3629 struct brw_vec4_compile *c,
3630 struct gl_program *prog,
3631 const struct brw_vue_prog_key *key,
3632 struct brw_vue_prog_data *prog_data,
3633 struct gl_shader_program *shader_prog,
3634 gl_shader_stage stage,
3635 void *mem_ctx,
3636 bool no_spills,
3637 shader_time_shader_type st_base,
3638 shader_time_shader_type st_written,
3639 shader_time_shader_type st_reset)
3640 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3641 c(c),
3642 key(key),
3643 prog_data(prog_data),
3644 sanity_param_count(0),
3645 fail_msg(NULL),
3646 first_non_payload_grf(0),
3647 need_all_constants_in_pull_buffer(false),
3648 no_spills(no_spills),
3649 st_base(st_base),
3650 st_written(st_written),
3651 st_reset(st_reset)
3652 {
3653 this->mem_ctx = mem_ctx;
3654 this->failed = false;
3655
3656 this->base_ir = NULL;
3657 this->current_annotation = NULL;
3658 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3659
3660 this->variable_ht = hash_table_ctor(0,
3661 hash_table_pointer_hash,
3662 hash_table_pointer_compare);
3663
3664 this->virtual_grf_start = NULL;
3665 this->virtual_grf_end = NULL;
3666 this->live_intervals = NULL;
3667
3668 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3669
3670 this->uniforms = 0;
3671
3672 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3673 * at least one. See setup_uniforms() in brw_vec4.cpp.
3674 */
3675 this->uniform_array_size = 1;
3676 if (prog_data) {
3677 this->uniform_array_size =
3678 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3679 }
3680
3681 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3682 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3683 }
3684
3685 vec4_visitor::~vec4_visitor()
3686 {
3687 hash_table_dtor(this->variable_ht);
3688 }
3689
3690
3691 void
3692 vec4_visitor::fail(const char *format, ...)
3693 {
3694 va_list va;
3695 char *msg;
3696
3697 if (failed)
3698 return;
3699
3700 failed = true;
3701
3702 va_start(va, format);
3703 msg = ralloc_vasprintf(mem_ctx, format, va);
3704 va_end(va);
3705 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3706
3707 this->fail_msg = msg;
3708
3709 if (debug_enabled) {
3710 fprintf(stderr, "%s", msg);
3711 }
3712 }
3713
3714 } /* namespace brw */