i965/vec4: Add and use byte-MOV instruction for unpack 4x8.
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(brw->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (brw->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (brw->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (brw->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
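   /* 0x00, 0x60, 0x70 and 0x78 are the vector-float (VF) encodings of
    * 0.0, 8.0, 16.0 and 24.0; with the integer-typed destination this is the
    * type-converting MOV mentioned above, producing the shift counts
    * <0, 8, 16, 24>.
    */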
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
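   /* Broadcast the packed dword across all four channels so that each
    * channel is shifted by a different amount from the vector above.
    */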
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
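   /* Retyping the result to UB makes the byte-MOV read only the low byte of
    * each 32-bit channel -- the byte selected by the shift above -- and
    * convert it to float in f.
    */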
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
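   /* As in the unorm case, but with a signed byte (B) source the byte-MOV
    * sign-extends, giving values in [-128, 127]; the scale by 1/127 and the
    * clamp below map them into [-1, 1].
    */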
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
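   /* VEC4_OPCODE_PACK_BYTES takes the low byte of each 32-bit channel of the
    * uint source and packs the four bytes into a single dword, the packed
    * 4x8 result.
    */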
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_driver_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (strncmp(ir->name, storage->name, namelen) != 0 ||
690 (storage->name[namelen] != 0 &&
691 storage->name[namelen] != '.' &&
692 storage->name[namelen] != '[')) {
693 continue;
694 }
695
696 gl_constant_value *components = storage->storage;
697 unsigned vector_count = (MAX2(storage->array_elements, 1) *
698 storage->type->matrix_columns);
699
700 for (unsigned s = 0; s < vector_count; s++) {
701 assert(uniforms < uniform_array_size);
702 uniform_vector_size[uniforms] = storage->type->vector_elements;
703
704 int i;
705 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
706 stage_prog_data->param[uniforms * 4 + i] = components;
707 components++;
708 }
709 for (; i < 4; i++) {
710 static gl_constant_value zero = { 0.0 };
711 stage_prog_data->param[uniforms * 4 + i] = &zero;
712 }
713
714 uniforms++;
715 }
716 }
717 }
718
719 void
720 vec4_visitor::setup_uniform_clipplane_values()
721 {
722 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
723
724 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 4;
727 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
728 this->userplane[i].type = BRW_REGISTER_TYPE_F;
729 for (int j = 0; j < 4; ++j) {
730 stage_prog_data->param[this->uniforms * 4 + j] =
731 (gl_constant_value *) &clip_planes[i][j];
732 }
733 ++this->uniforms;
734 }
735 }
736
737 /* Our support for builtin uniforms is even scarier than non-builtin.
738 * It sits on top of the PROG_STATE_VAR parameters that are
739 * automatically updated from GL context state.
740 */
741 void
742 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
743 {
744 const ir_state_slot *const slots = ir->get_state_slots();
745 assert(slots != NULL);
746
747 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
748 /* This state reference has already been setup by ir_to_mesa,
749 * but we'll get the same index back here. We can reference
750 * ParameterValues directly, since unlike brw_fs.cpp, we never
751 * add new state references during compile.
752 */
753 int index = _mesa_add_state_reference(this->prog->Parameters,
754 (gl_state_index *)slots[i].tokens);
755 gl_constant_value *values =
756 &this->prog->Parameters->ParameterValues[index][0];
757
758 assert(this->uniforms < uniform_array_size);
759 this->uniform_vector_size[this->uniforms] = 0;
760 /* Add each of the unique swizzled channels of the element.
761 * This will end up matching the size of the glsl_type of this field.
762 */
763 int last_swiz = -1;
764 for (unsigned int j = 0; j < 4; j++) {
765 int swiz = GET_SWZ(slots[i].swizzle, j);
766 last_swiz = swiz;
767
768 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
769 assert(this->uniforms < uniform_array_size);
770 if (swiz <= last_swiz)
771 this->uniform_vector_size[this->uniforms]++;
772 }
773 this->uniforms++;
774 }
775 }
776
777 dst_reg *
778 vec4_visitor::variable_storage(ir_variable *var)
779 {
780 return (dst_reg *)hash_table_find(this->variable_ht, var);
781 }
782
783 void
784 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
785 enum brw_predicate *predicate)
786 {
787 ir_expression *expr = ir->as_expression();
788
789 *predicate = BRW_PREDICATE_NORMAL;
790
791 if (expr && expr->operation != ir_binop_ubo_load) {
792 src_reg op[3];
793 vec4_instruction *inst;
794
795 assert(expr->get_num_operands() <= 3);
796 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
797 expr->operands[i]->accept(this);
798 op[i] = this->result;
799
800 resolve_ud_negate(&op[i]);
801 }
802
803 switch (expr->operation) {
804 case ir_unop_logic_not:
805 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
806 inst->conditional_mod = BRW_CONDITIONAL_Z;
807 break;
808
809 case ir_binop_logic_xor:
810 if (brw->gen <= 5) {
811 src_reg temp = src_reg(this, ir->type);
812 emit(XOR(dst_reg(temp), op[0], op[1]));
813 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
814 } else {
815 inst = emit(XOR(dst_null_d(), op[0], op[1]));
816 }
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 break;
819
820 case ir_binop_logic_or:
821 if (brw->gen <= 5) {
822 src_reg temp = src_reg(this, ir->type);
823 emit(OR(dst_reg(temp), op[0], op[1]));
824 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
825 } else {
826 inst = emit(OR(dst_null_d(), op[0], op[1]));
827 }
828 inst->conditional_mod = BRW_CONDITIONAL_NZ;
829 break;
830
831 case ir_binop_logic_and:
832 if (brw->gen <= 5) {
833 src_reg temp = src_reg(this, ir->type);
834 emit(AND(dst_reg(temp), op[0], op[1]));
835 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
836 } else {
837 inst = emit(AND(dst_null_d(), op[0], op[1]));
838 }
839 inst->conditional_mod = BRW_CONDITIONAL_NZ;
840 break;
841
842 case ir_unop_f2b:
843 if (brw->gen >= 6) {
844 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
845 } else {
846 inst = emit(MOV(dst_null_f(), op[0]));
847 inst->conditional_mod = BRW_CONDITIONAL_NZ;
848 }
849 break;
850
851 case ir_unop_i2b:
852 if (brw->gen >= 6) {
853 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
854 } else {
855 inst = emit(MOV(dst_null_d(), op[0]));
856 inst->conditional_mod = BRW_CONDITIONAL_NZ;
857 }
858 break;
859
860 case ir_binop_all_equal:
861 if (brw->gen <= 5) {
862 resolve_bool_comparison(expr->operands[0], &op[0]);
863 resolve_bool_comparison(expr->operands[1], &op[1]);
864 }
865 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
866 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
867 break;
868
869 case ir_binop_any_nequal:
870 if (brw->gen <= 5) {
871 resolve_bool_comparison(expr->operands[0], &op[0]);
872 resolve_bool_comparison(expr->operands[1], &op[1]);
873 }
874 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
875 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
876 break;
877
878 case ir_unop_any:
879 if (brw->gen <= 5) {
880 resolve_bool_comparison(expr->operands[0], &op[0]);
881 }
882 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
883 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
884 break;
885
886 case ir_binop_greater:
887 case ir_binop_gequal:
888 case ir_binop_less:
889 case ir_binop_lequal:
890 case ir_binop_equal:
891 case ir_binop_nequal:
892 if (brw->gen <= 5) {
893 resolve_bool_comparison(expr->operands[0], &op[0]);
894 resolve_bool_comparison(expr->operands[1], &op[1]);
895 }
896 emit(CMP(dst_null_d(), op[0], op[1],
897 brw_conditional_for_comparison(expr->operation)));
898 break;
899
900 case ir_triop_csel: {
901 /* Expand the boolean condition into the flag register. */
902 inst = emit(MOV(dst_null_d(), op[0]));
903 inst->conditional_mod = BRW_CONDITIONAL_NZ;
904
905 /* Select which boolean to return. */
906 dst_reg temp(this, expr->operands[1]->type);
907 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
908 inst->predicate = BRW_PREDICATE_NORMAL;
909
910 /* Expand the result to a condition code. */
911 inst = emit(MOV(dst_null_d(), src_reg(temp)));
912 inst->conditional_mod = BRW_CONDITIONAL_NZ;
913 break;
914 }
915
916 default:
917 unreachable("not reached");
918 }
919 return;
920 }
921
922 ir->accept(this);
923
924 resolve_ud_negate(&this->result);
925
926 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
927 inst->conditional_mod = BRW_CONDITIONAL_NZ;
928 }
929
930 /**
931 * Emit a gen6 IF statement with the comparison folded into the IF
932 * instruction.
933 */
934 void
935 vec4_visitor::emit_if_gen6(ir_if *ir)
936 {
937 ir_expression *expr = ir->condition->as_expression();
938
939 if (expr && expr->operation != ir_binop_ubo_load) {
940 src_reg op[3];
941 dst_reg temp;
942
943 assert(expr->get_num_operands() <= 3);
944 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
945 expr->operands[i]->accept(this);
946 op[i] = this->result;
947 }
948
949 switch (expr->operation) {
950 case ir_unop_logic_not:
951 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
952 return;
953
954 case ir_binop_logic_xor:
955 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
956 return;
957
958 case ir_binop_logic_or:
959 temp = dst_reg(this, glsl_type::bool_type);
960 emit(OR(temp, op[0], op[1]));
961 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
962 return;
963
964 case ir_binop_logic_and:
965 temp = dst_reg(this, glsl_type::bool_type);
966 emit(AND(temp, op[0], op[1]));
967 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
968 return;
969
970 case ir_unop_f2b:
971 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
972 return;
973
974 case ir_unop_i2b:
975 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
976 return;
977
978 case ir_binop_greater:
979 case ir_binop_gequal:
980 case ir_binop_less:
981 case ir_binop_lequal:
982 case ir_binop_equal:
983 case ir_binop_nequal:
984 emit(IF(op[0], op[1],
985 brw_conditional_for_comparison(expr->operation)));
986 return;
987
988 case ir_binop_all_equal:
989 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
990 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
991 return;
992
993 case ir_binop_any_nequal:
994 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
995 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
996 return;
997
998 case ir_unop_any:
999 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1000 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1001 return;
1002
1003 case ir_triop_csel: {
1004 /* Expand the boolean condition into the flag register. */
1005 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1006 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1007
1008 /* Select which boolean to return. */
1009 dst_reg temp(this, expr->operands[1]->type);
1010 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1011 inst->predicate = BRW_PREDICATE_NORMAL;
1012
1013 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1014 return;
1015 }
1016
1017 default:
1018 unreachable("not reached");
1019 }
1020 return;
1021 }
1022
1023 ir->condition->accept(this);
1024
1025 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1026 }
1027
1028 void
1029 vec4_visitor::visit(ir_variable *ir)
1030 {
1031 dst_reg *reg = NULL;
1032
1033 if (variable_storage(ir))
1034 return;
1035
1036 switch (ir->data.mode) {
1037 case ir_var_shader_in:
1038 assert(ir->data.location != -1);
1039 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1040 break;
1041
1042 case ir_var_shader_out:
1043 assert(ir->data.location != -1);
1044 reg = new(mem_ctx) dst_reg(this, ir->type);
1045
1046 for (int i = 0; i < type_size(ir->type); i++) {
1047 output_reg[ir->data.location + i] = *reg;
1048 output_reg[ir->data.location + i].reg_offset = i;
1049 output_reg[ir->data.location + i].type =
1050 brw_type_for_base_type(ir->type->get_scalar_type());
1051 output_reg_annotation[ir->data.location + i] = ir->name;
1052 }
1053 break;
1054
1055 case ir_var_auto:
1056 case ir_var_temporary:
1057 reg = new(mem_ctx) dst_reg(this, ir->type);
1058 break;
1059
1060 case ir_var_uniform:
1061 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1062
1063 /* Thanks to the lower_ubo_reference pass, we will see only
1064 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1065 * variables, so no need for them to be in variable_ht.
1066 *
1067 * Some uniforms, such as samplers and atomic counters, have no actual
1068 * storage, so we should ignore them.
1069 */
1070 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1071 return;
1072
1073 /* Track how big the whole uniform variable is, in case we need to put a
1074 * copy of its data into pull constants for array access.
1075 */
1076 assert(this->uniforms < uniform_array_size);
1077 this->uniform_size[this->uniforms] = type_size(ir->type);
1078
1079 if (!strncmp(ir->name, "gl_", 3)) {
1080 setup_builtin_uniform_values(ir);
1081 } else {
1082 setup_uniform_values(ir);
1083 }
1084 break;
1085
1086 case ir_var_system_value:
1087 reg = make_reg_for_system_value(ir);
1088 break;
1089
1090 default:
1091 unreachable("not reached");
1092 }
1093
1094 reg->type = brw_type_for_base_type(ir->type);
1095 hash_table_insert(this->variable_ht, reg, ir);
1096 }
1097
1098 void
1099 vec4_visitor::visit(ir_loop *ir)
1100 {
1101 /* We don't want debugging output to print the whole body of the
1102 * loop as the annotation.
1103 */
1104 this->base_ir = NULL;
1105
1106 emit(BRW_OPCODE_DO);
1107
1108 visit_instructions(&ir->body_instructions);
1109
1110 emit(BRW_OPCODE_WHILE);
1111 }
1112
1113 void
1114 vec4_visitor::visit(ir_loop_jump *ir)
1115 {
1116 switch (ir->mode) {
1117 case ir_loop_jump::jump_break:
1118 emit(BRW_OPCODE_BREAK);
1119 break;
1120 case ir_loop_jump::jump_continue:
1121 emit(BRW_OPCODE_CONTINUE);
1122 break;
1123 }
1124 }
1125
1126
1127 void
1128 vec4_visitor::visit(ir_function_signature *)
1129 {
1130 unreachable("not reached");
1131 }
1132
1133 void
1134 vec4_visitor::visit(ir_function *ir)
1135 {
1136 /* Ignore function bodies other than main() -- we shouldn't see calls to
1137 * them since they should all be inlined.
1138 */
1139 if (strcmp(ir->name, "main") == 0) {
1140 const ir_function_signature *sig;
1141 exec_list empty;
1142
1143 sig = ir->matching_signature(NULL, &empty, false);
1144
1145 assert(sig);
1146
1147 visit_instructions(&sig->body);
1148 }
1149 }
1150
1151 bool
1152 vec4_visitor::try_emit_mad(ir_expression *ir)
1153 {
1154 /* 3-src instructions were introduced in gen6. */
1155 if (brw->gen < 6)
1156 return false;
1157
1158 /* MAD can only handle floating-point data. */
1159 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1160 return false;
1161
1162 ir_rvalue *nonmul = ir->operands[1];
1163 ir_expression *mul = ir->operands[0]->as_expression();
1164
1165 bool mul_negate = false, mul_abs = false;
1166 if (mul && mul->operation == ir_unop_abs) {
1167 mul = mul->operands[0]->as_expression();
1168 mul_abs = true;
1169 } else if (mul && mul->operation == ir_unop_neg) {
1170 mul = mul->operands[0]->as_expression();
1171 mul_negate = true;
1172 }
1173
1174 if (!mul || mul->operation != ir_binop_mul) {
1175 nonmul = ir->operands[0];
1176 mul = ir->operands[1]->as_expression();
1177
1178 if (mul && mul->operation == ir_unop_abs) {
1179 mul = mul->operands[0]->as_expression();
1180 mul_abs = true;
1181 } else if (mul && mul->operation == ir_unop_neg) {
1182 mul = mul->operands[0]->as_expression();
1183 mul_negate = true;
1184 }
1185
1186 if (!mul || mul->operation != ir_binop_mul)
1187 return false;
1188 }
1189
1190 nonmul->accept(this);
1191 src_reg src0 = fix_3src_operand(this->result);
1192
1193 mul->operands[0]->accept(this);
1194 src_reg src1 = fix_3src_operand(this->result);
1195 src1.negate ^= mul_negate;
1196 src1.abs = mul_abs;
1197 if (mul_abs)
1198 src1.negate = false;
1199
1200 mul->operands[1]->accept(this);
1201 src_reg src2 = fix_3src_operand(this->result);
1202 src2.abs = mul_abs;
1203 if (mul_abs)
1204 src2.negate = false;
1205
1206 this->result = src_reg(this, ir->type);
1207 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1208
1209 return true;
1210 }
1211
1212 bool
1213 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1214 {
1215 /* This optimization relies on CMP setting the destination to 0 when
1216 * false. Early hardware only sets the least significant bit, and
1217 * leaves the other bits undefined. So we can't use it.
1218 */
1219 if (brw->gen < 6)
1220 return false;
1221
1222 ir_expression *const cmp = ir->operands[0]->as_expression();
1223
1224 if (cmp == NULL)
1225 return false;
1226
1227 switch (cmp->operation) {
1228 case ir_binop_less:
1229 case ir_binop_greater:
1230 case ir_binop_lequal:
1231 case ir_binop_gequal:
1232 case ir_binop_equal:
1233 case ir_binop_nequal:
1234 break;
1235
1236 default:
1237 return false;
1238 }
1239
1240 cmp->operands[0]->accept(this);
1241 const src_reg cmp_src0 = this->result;
1242
1243 cmp->operands[1]->accept(this);
1244 const src_reg cmp_src1 = this->result;
1245
1246 this->result = src_reg(this, ir->type);
1247
1248 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1249 brw_conditional_for_comparison(cmp->operation)));
1250
1251 /* If the comparison is false, this->result will just happen to be zero.
1252 */
1253 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1254 this->result, src_reg(1.0f));
1255 inst->predicate = BRW_PREDICATE_NORMAL;
1256 inst->predicate_inverse = true;
1257
1258 return true;
1259 }
1260
1261 void
1262 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1263 src_reg src0, src_reg src1)
1264 {
1265 vec4_instruction *inst;
1266
1267 if (brw->gen >= 6) {
1268 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1269 inst->conditional_mod = conditionalmod;
1270 } else {
1271 emit(CMP(dst, src0, src1, conditionalmod));
1272
1273 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1274 inst->predicate = BRW_PREDICATE_NORMAL;
1275 }
1276 }
1277
1278 void
1279 vec4_visitor::emit_lrp(const dst_reg &dst,
1280 const src_reg &x, const src_reg &y, const src_reg &a)
1281 {
1282 if (brw->gen >= 6) {
1283 /* Note that the instruction's argument order is reversed from GLSL
1284 * and the IR.
1285 */
1286 emit(LRP(dst,
1287 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1288 } else {
1289 /* Earlier generations don't support three source operations, so we
1290 * need to emit x*(1-a) + y*a.
1291 */
1292 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1293 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1294 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1295 y_times_a.writemask = dst.writemask;
1296 one_minus_a.writemask = dst.writemask;
1297 x_times_one_minus_a.writemask = dst.writemask;
1298
1299 emit(MUL(y_times_a, y, a));
1300 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1301 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1302 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1303 }
1304 }
1305
1306 void
1307 vec4_visitor::visit(ir_expression *ir)
1308 {
1309 unsigned int operand;
1310 src_reg op[Elements(ir->operands)];
1311 vec4_instruction *inst;
1312
1313 if (ir->operation == ir_binop_add) {
1314 if (try_emit_mad(ir))
1315 return;
1316 }
1317
1318 if (ir->operation == ir_unop_b2f) {
1319 if (try_emit_b2f_of_compare(ir))
1320 return;
1321 }
1322
1323 /* Storage for our result. Ideally for an assignment we'd be using
1324 * the actual storage for the result here, instead.
1325 */
1326 dst_reg result_dst(this, ir->type);
1327 src_reg result_src(result_dst);
1328
1329 if (ir->operation == ir_triop_csel) {
1330 ir->operands[1]->accept(this);
1331 op[1] = this->result;
1332 ir->operands[2]->accept(this);
1333 op[2] = this->result;
1334
1335 enum brw_predicate predicate;
1336 emit_bool_to_cond_code(ir->operands[0], &predicate);
1337 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1338 inst->predicate = predicate;
1339 this->result = result_src;
1340 return;
1341 }
1342
1343 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1344 this->result.file = BAD_FILE;
1345 ir->operands[operand]->accept(this);
1346 if (this->result.file == BAD_FILE) {
1347 fprintf(stderr, "Failed to get tree for expression operand:\n");
1348 ir->operands[operand]->fprint(stderr);
1349 exit(1);
1350 }
1351 op[operand] = this->result;
1352
1353 /* Matrix expression operands should have been broken down to vector
1354 * operations already.
1355 */
1356 assert(!ir->operands[operand]->type->is_matrix());
1357 }
1358
1359 /* If nothing special happens, this is the result. */
1360 this->result = result_src;
1361
1362 switch (ir->operation) {
1363 case ir_unop_logic_not:
1364 emit(NOT(result_dst, op[0]));
1365 break;
1366 case ir_unop_neg:
1367 op[0].negate = !op[0].negate;
1368 emit(MOV(result_dst, op[0]));
1369 break;
1370 case ir_unop_abs:
1371 op[0].abs = true;
1372 op[0].negate = false;
1373 emit(MOV(result_dst, op[0]));
1374 break;
1375
1376 case ir_unop_sign:
1377 if (ir->type->is_float()) {
1378 /* AND(val, 0x80000000) gives the sign bit.
1379 *
1380 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1381 * zero.
1382 */
1383 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1384
1385 op[0].type = BRW_REGISTER_TYPE_UD;
1386 result_dst.type = BRW_REGISTER_TYPE_UD;
1387 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1388
1389 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1390 inst->predicate = BRW_PREDICATE_NORMAL;
1391
1392 this->result.type = BRW_REGISTER_TYPE_F;
1393 } else {
1394 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1395 * -> non-negative val generates 0x00000000.
1396 * Predicated OR sets 1 if val is positive.
1397 */
1398 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1399
1400 emit(ASR(result_dst, op[0], src_reg(31)));
1401
1402 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1403 inst->predicate = BRW_PREDICATE_NORMAL;
1404 }
1405 break;
1406
1407 case ir_unop_rcp:
1408 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1409 break;
1410
1411 case ir_unop_exp2:
1412 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1413 break;
1414 case ir_unop_log2:
1415 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1416 break;
1417 case ir_unop_exp:
1418 case ir_unop_log:
1419 unreachable("not reached: should be handled by ir_explog_to_explog2");
1420 case ir_unop_sin:
1421 case ir_unop_sin_reduced:
1422 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1423 break;
1424 case ir_unop_cos:
1425 case ir_unop_cos_reduced:
1426 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1427 break;
1428
1429 case ir_unop_dFdx:
1430 case ir_unop_dFdx_coarse:
1431 case ir_unop_dFdx_fine:
1432 case ir_unop_dFdy:
1433 case ir_unop_dFdy_coarse:
1434 case ir_unop_dFdy_fine:
1435 unreachable("derivatives not valid in vertex shader");
1436
1437 case ir_unop_bitfield_reverse:
1438 emit(BFREV(result_dst, op[0]));
1439 break;
1440 case ir_unop_bit_count:
1441 emit(CBIT(result_dst, op[0]));
1442 break;
1443 case ir_unop_find_msb: {
1444 src_reg temp = src_reg(this, glsl_type::uint_type);
1445
1446 inst = emit(FBH(dst_reg(temp), op[0]));
1447 inst->dst.writemask = WRITEMASK_XYZW;
1448
1449 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1450 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1451 * subtract the result from 31 to convert the MSB count into an LSB count.
1452 */
1453
1454 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1455 temp.swizzle = BRW_SWIZZLE_NOOP;
1456 emit(MOV(result_dst, temp));
1457
1458 src_reg src_tmp = src_reg(result_dst);
1459 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1460
1461 src_tmp.negate = true;
1462 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1463 inst->predicate = BRW_PREDICATE_NORMAL;
1464 break;
1465 }
1466 case ir_unop_find_lsb:
1467 emit(FBL(result_dst, op[0]));
1468 break;
1469 case ir_unop_saturate:
1470 inst = emit(MOV(result_dst, op[0]));
1471 inst->saturate = true;
1472 break;
1473
1474 case ir_unop_noise:
1475 unreachable("not reached: should be handled by lower_noise");
1476
1477 case ir_binop_add:
1478 emit(ADD(result_dst, op[0], op[1]));
1479 break;
1480 case ir_binop_sub:
1481 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1482
1483 case ir_binop_mul:
1484 if (brw->gen < 8 && ir->type->is_integer()) {
1485 /* For integer multiplication, the MUL uses the low 16 bits of one of
1486 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1487 * accumulates in the contribution of the upper 16 bits of that
1488 * operand. If we can determine that one of the args is in the low
1489 * 16 bits, though, we can just emit a single MUL.
1490 */
1491 if (ir->operands[0]->is_uint16_constant()) {
1492 if (brw->gen < 7)
1493 emit(MUL(result_dst, op[0], op[1]));
1494 else
1495 emit(MUL(result_dst, op[1], op[0]));
1496 } else if (ir->operands[1]->is_uint16_constant()) {
1497 if (brw->gen < 7)
1498 emit(MUL(result_dst, op[1], op[0]));
1499 else
1500 emit(MUL(result_dst, op[0], op[1]));
1501 } else {
1502 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1503
1504 emit(MUL(acc, op[0], op[1]));
1505 emit(MACH(dst_null_d(), op[0], op[1]));
1506 emit(MOV(result_dst, src_reg(acc)));
1507 }
1508 } else {
1509 emit(MUL(result_dst, op[0], op[1]));
1510 }
1511 break;
1512 case ir_binop_imul_high: {
1513 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1514
1515 emit(MUL(acc, op[0], op[1]));
1516 emit(MACH(result_dst, op[0], op[1]));
1517 break;
1518 }
1519 case ir_binop_div:
1520 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1521 assert(ir->type->is_integer());
1522 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1523 break;
1524 case ir_binop_carry: {
1525 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1526
1527 emit(ADDC(dst_null_ud(), op[0], op[1]));
1528 emit(MOV(result_dst, src_reg(acc)));
1529 break;
1530 }
1531 case ir_binop_borrow: {
1532 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1533
1534 emit(SUBB(dst_null_ud(), op[0], op[1]));
1535 emit(MOV(result_dst, src_reg(acc)));
1536 break;
1537 }
1538 case ir_binop_mod:
1539 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1540 assert(ir->type->is_integer());
1541 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1542 break;
1543
1544 case ir_binop_less:
1545 case ir_binop_greater:
1546 case ir_binop_lequal:
1547 case ir_binop_gequal:
1548 case ir_binop_equal:
1549 case ir_binop_nequal: {
1550 if (brw->gen <= 5) {
1551 resolve_bool_comparison(ir->operands[0], &op[0]);
1552 resolve_bool_comparison(ir->operands[1], &op[1]);
1553 }
1554 emit(CMP(result_dst, op[0], op[1],
1555 brw_conditional_for_comparison(ir->operation)));
1556 break;
1557 }
1558
1559 case ir_binop_all_equal:
1560 /* "==" operator producing a scalar boolean. */
1561 if (ir->operands[0]->type->is_vector() ||
1562 ir->operands[1]->type->is_vector()) {
1563 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1564 emit(MOV(result_dst, src_reg(0)));
1565 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1566 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1567 } else {
1568 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1569 }
1570 break;
1571 case ir_binop_any_nequal:
1572 /* "!=" operator producing a scalar boolean. */
1573 if (ir->operands[0]->type->is_vector() ||
1574 ir->operands[1]->type->is_vector()) {
1575 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1576
1577 emit(MOV(result_dst, src_reg(0)));
1578 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1579 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1580 } else {
1581 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1582 }
1583 break;
1584
1585 case ir_unop_any:
1586 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1587 emit(MOV(result_dst, src_reg(0)));
1588
1589 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1590 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1591 break;
1592
1593 case ir_binop_logic_xor:
1594 emit(XOR(result_dst, op[0], op[1]));
1595 break;
1596
1597 case ir_binop_logic_or:
1598 emit(OR(result_dst, op[0], op[1]));
1599 break;
1600
1601 case ir_binop_logic_and:
1602 emit(AND(result_dst, op[0], op[1]));
1603 break;
1604
1605 case ir_binop_dot:
1606 assert(ir->operands[0]->type->is_vector());
1607 assert(ir->operands[0]->type == ir->operands[1]->type);
1608 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1609 break;
1610
1611 case ir_unop_sqrt:
1612 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1613 break;
1614 case ir_unop_rsq:
1615 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1616 break;
1617
1618 case ir_unop_bitcast_i2f:
1619 case ir_unop_bitcast_u2f:
1620 this->result = op[0];
1621 this->result.type = BRW_REGISTER_TYPE_F;
1622 break;
1623
1624 case ir_unop_bitcast_f2i:
1625 this->result = op[0];
1626 this->result.type = BRW_REGISTER_TYPE_D;
1627 break;
1628
1629 case ir_unop_bitcast_f2u:
1630 this->result = op[0];
1631 this->result.type = BRW_REGISTER_TYPE_UD;
1632 break;
1633
1634 case ir_unop_i2f:
1635 case ir_unop_i2u:
1636 case ir_unop_u2i:
1637 case ir_unop_u2f:
1638 case ir_unop_f2i:
1639 case ir_unop_f2u:
1640 emit(MOV(result_dst, op[0]));
1641 break;
1642 case ir_unop_b2i:
1643 emit(AND(result_dst, op[0], src_reg(1)));
1644 break;
1645 case ir_unop_b2f:
1646 if (brw->gen <= 5) {
1647 resolve_bool_comparison(ir->operands[0], &op[0]);
1648 }
1649 op[0].type = BRW_REGISTER_TYPE_D;
1650 result_dst.type = BRW_REGISTER_TYPE_D;
1651 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1652 result_dst.type = BRW_REGISTER_TYPE_F;
1653 break;
1654 case ir_unop_f2b:
1655 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1656 break;
1657 case ir_unop_i2b:
1658 emit(AND(result_dst, op[0], src_reg(1)));
1659 break;
1660
1661 case ir_unop_trunc:
1662 emit(RNDZ(result_dst, op[0]));
1663 break;
1664 case ir_unop_ceil: {
1665 src_reg tmp = src_reg(this, ir->type);
1666 op[0].negate = !op[0].negate;
1667 emit(RNDD(dst_reg(tmp), op[0]));
1668 tmp.negate = true;
1669 emit(MOV(result_dst, tmp));
1670 }
1671 break;
1672 case ir_unop_floor:
1673 inst = emit(RNDD(result_dst, op[0]));
1674 break;
1675 case ir_unop_fract:
1676 inst = emit(FRC(result_dst, op[0]));
1677 break;
1678 case ir_unop_round_even:
1679 emit(RNDE(result_dst, op[0]));
1680 break;
1681
1682 case ir_binop_min:
1683 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1684 break;
1685 case ir_binop_max:
1686 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1687 break;
1688
1689 case ir_binop_pow:
1690 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1691 break;
1692
1693 case ir_unop_bit_not:
1694 inst = emit(NOT(result_dst, op[0]));
1695 break;
1696 case ir_binop_bit_and:
1697 inst = emit(AND(result_dst, op[0], op[1]));
1698 break;
1699 case ir_binop_bit_xor:
1700 inst = emit(XOR(result_dst, op[0], op[1]));
1701 break;
1702 case ir_binop_bit_or:
1703 inst = emit(OR(result_dst, op[0], op[1]));
1704 break;
1705
1706 case ir_binop_lshift:
1707 inst = emit(SHL(result_dst, op[0], op[1]));
1708 break;
1709
1710 case ir_binop_rshift:
1711 if (ir->type->base_type == GLSL_TYPE_INT)
1712 inst = emit(ASR(result_dst, op[0], op[1]));
1713 else
1714 inst = emit(SHR(result_dst, op[0], op[1]));
1715 break;
1716
1717 case ir_binop_bfm:
1718 emit(BFI1(result_dst, op[0], op[1]));
1719 break;
1720
1721 case ir_binop_ubo_load: {
1722 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1723 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1724 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1725 src_reg offset;
1726
1727 /* Now, load the vector from that offset. */
1728 assert(ir->type->is_vector() || ir->type->is_scalar());
1729
1730 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1731 packed_consts.type = result.type;
1732 src_reg surf_index;
1733
1734 if (const_uniform_block) {
1735 /* The block index is a constant, so just emit the binding table entry
1736 * as an immediate.
1737 */
1738 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1739 const_uniform_block->value.u[0]);
1740 } else {
1741 /* The block index is not a constant. Evaluate the index expression
1742 * per-channel and add the base UBO index; the generator will select
1743 * a value from any live channel.
1744 */
1745 surf_index = src_reg(this, glsl_type::uint_type);
1746 emit(ADD(dst_reg(surf_index), op[0],
1747 src_reg(prog_data->base.binding_table.ubo_start)));
1748
1749 /* Assume this may touch any UBO. It would be nice to provide
1750 * a tighter bound, but the array information is already lowered away.
1751 */
1752 brw_mark_surface_used(&prog_data->base,
1753 prog_data->base.binding_table.ubo_start +
1754 shader_prog->NumUniformBlocks - 1);
1755 }
1756
1757 if (const_offset_ir) {
1758 if (brw->gen >= 8) {
1759 /* Store the offset in a GRF so we can send-from-GRF. */
1760 offset = src_reg(this, glsl_type::int_type);
1761 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1762 } else {
1763 /* Immediates are fine on older generations since they'll be moved
1764 * to a (potentially fake) MRF at the generator level.
1765 */
1766 offset = src_reg(const_offset / 16);
1767 }
1768 } else {
1769 offset = src_reg(this, glsl_type::uint_type);
1770 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1771 }
1772
1773 if (brw->gen >= 7) {
1774 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1775 grf_offset.type = offset.type;
1776
1777 emit(MOV(grf_offset, offset));
1778
1779 vec4_instruction *pull =
1780 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1781 dst_reg(packed_consts),
1782 surf_index,
1783 src_reg(grf_offset)));
1784 pull->mlen = 1;
1785 } else {
1786 vec4_instruction *pull =
1787 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1788 dst_reg(packed_consts),
1789 surf_index,
1790 offset));
1791 pull->base_mrf = 14;
1792 pull->mlen = 1;
1793 }
1794
1795 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1796 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1797 const_offset % 16 / 4,
1798 const_offset % 16 / 4,
1799 const_offset % 16 / 4);
1800
1801 /* UBO bools are any nonzero int. We need to convert them to use the
1802 * value of true stored in ctx->Const.UniformBooleanTrue.
1803 */
1804 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1805 emit(CMP(result_dst, packed_consts, src_reg(0u),
1806 BRW_CONDITIONAL_NZ));
1807 } else {
1808 emit(MOV(result_dst, packed_consts));
1809 }
1810 break;
1811 }
1812
1813 case ir_binop_vector_extract:
1814 unreachable("should have been lowered by vec_index_to_cond_assign");
1815
1816 case ir_triop_fma:
1817 op[0] = fix_3src_operand(op[0]);
1818 op[1] = fix_3src_operand(op[1]);
1819 op[2] = fix_3src_operand(op[2]);
1820 /* Note that the instruction's argument order is reversed from GLSL
1821 * and the IR.
1822 */
1823 emit(MAD(result_dst, op[2], op[1], op[0]));
1824 break;
1825
1826 case ir_triop_lrp:
1827 emit_lrp(result_dst, op[0], op[1], op[2]);
1828 break;
1829
1830 case ir_triop_csel:
1831 unreachable("already handled above");
1832 break;
1833
1834 case ir_triop_bfi:
1835 op[0] = fix_3src_operand(op[0]);
1836 op[1] = fix_3src_operand(op[1]);
1837 op[2] = fix_3src_operand(op[2]);
1838 emit(BFI2(result_dst, op[0], op[1], op[2]));
1839 break;
1840
1841 case ir_triop_bitfield_extract:
1842 op[0] = fix_3src_operand(op[0]);
1843 op[1] = fix_3src_operand(op[1]);
1844 op[2] = fix_3src_operand(op[2]);
1845 /* Note that the instruction's argument order is reversed from GLSL
1846 * and the IR.
1847 */
1848 emit(BFE(result_dst, op[2], op[1], op[0]));
1849 break;
1850
1851 case ir_triop_vector_insert:
1852 unreachable("should have been lowered by lower_vector_insert");
1853
1854 case ir_quadop_bitfield_insert:
1855 unreachable("not reached: should be handled by "
1856 "bitfield_insert_to_bfm_bfi\n");
1857
1858 case ir_quadop_vector:
1859 unreachable("not reached: should be handled by lower_quadop_vector");
1860
1861 case ir_unop_pack_half_2x16:
1862 emit_pack_half_2x16(result_dst, op[0]);
1863 break;
1864 case ir_unop_unpack_half_2x16:
1865 emit_unpack_half_2x16(result_dst, op[0]);
1866 break;
1867 case ir_unop_unpack_unorm_4x8:
1868 emit_unpack_unorm_4x8(result_dst, op[0]);
1869 break;
1870 case ir_unop_unpack_snorm_4x8:
1871 emit_unpack_snorm_4x8(result_dst, op[0]);
1872 break;
1873 case ir_unop_pack_unorm_4x8:
1874 emit_pack_unorm_4x8(result_dst, op[0]);
1875 break;
1876 case ir_unop_pack_snorm_4x8:
1877 emit_pack_snorm_4x8(result_dst, op[0]);
1878 break;
1879 case ir_unop_pack_snorm_2x16:
1880 case ir_unop_pack_unorm_2x16:
1881 case ir_unop_unpack_snorm_2x16:
1882 case ir_unop_unpack_unorm_2x16:
1883 unreachable("not reached: should be handled by lower_packing_builtins");
1884 case ir_unop_unpack_half_2x16_split_x:
1885 case ir_unop_unpack_half_2x16_split_y:
1886 case ir_binop_pack_half_2x16_split:
1887 case ir_unop_interpolate_at_centroid:
1888 case ir_binop_interpolate_at_sample:
1889 case ir_binop_interpolate_at_offset:
1890 unreachable("not reached: should not occur in vertex shader");
1891 case ir_binop_ldexp:
1892 unreachable("not reached: should be handled by ldexp_to_arith()");
1893 case ir_unop_d2f:
1894 case ir_unop_f2d:
1895 case ir_unop_d2i:
1896 case ir_unop_i2d:
1897 case ir_unop_d2u:
1898 case ir_unop_u2d:
1899 case ir_unop_d2b:
1900 case ir_unop_pack_double_2x32:
1901 case ir_unop_unpack_double_2x32:
1902 case ir_unop_frexp_sig:
1903 case ir_unop_frexp_exp:
1904 unreachable("fp64 todo");
1905 }
1906 }
1907
1908
1909 void
1910 vec4_visitor::visit(ir_swizzle *ir)
1911 {
1912 src_reg src;
1913 int i = 0;
1914 int swizzle[4];
1915
1916 /* Note that this is only swizzles in expressions, not those on the left
1917 * hand side of an assignment, which do write masking. See ir_assignment
1918 * for that.
1919 */
1920
1921 ir->val->accept(this);
1922 src = this->result;
1923 assert(src.file != BAD_FILE);
1924
1925 for (i = 0; i < ir->type->vector_elements; i++) {
1926 switch (i) {
1927 case 0:
1928 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1929 break;
1930 case 1:
1931 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1932 break;
1933 case 2:
1934 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1935 break;
1936 case 3:
1937 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1938 break;
1939 }
1940 }
1941 for (; i < 4; i++) {
1942 /* Replicate the last channel out. */
1943 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1944 }
1945
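   /* For illustration: applying ".xy" to a source whose swizzle is already
    * WZYX composes to (W, Z), and the replication above fills the remaining
    * channels with Z, giving BRW_SWIZZLE4(W, Z, Z, Z).
    */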
1946 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1947
1948 this->result = src;
1949 }
1950
1951 void
1952 vec4_visitor::visit(ir_dereference_variable *ir)
1953 {
1954 const struct glsl_type *type = ir->type;
1955 dst_reg *reg = variable_storage(ir->var);
1956
1957 if (!reg) {
1958 fail("Failed to find variable storage for %s\n", ir->var->name);
1959 this->result = src_reg(brw_null_reg());
1960 return;
1961 }
1962
1963 this->result = src_reg(*reg);
1964
1965 /* System values get their swizzle from the dst_reg writemask */
1966 if (ir->var->data.mode == ir_var_system_value)
1967 return;
1968
1969 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1970 this->result.swizzle = swizzle_for_size(type->vector_elements);
1971 }
1972
1973
1974 int
1975 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1976 {
1977 /* Under normal circumstances array elements are stored consecutively, so
1978 * the stride is equal to the size of the array element.
1979 */
1980 return type_size(ir->type);
1981 }
1982
1983
1984 void
1985 vec4_visitor::visit(ir_dereference_array *ir)
1986 {
1987 ir_constant *constant_index;
1988 src_reg src;
1989 int array_stride = compute_array_stride(ir);
1990
1991 constant_index = ir->array_index->constant_expression_value();
1992
1993 ir->array->accept(this);
1994 src = this->result;
1995
1996 if (constant_index) {
1997 src.reg_offset += constant_index->value.i[0] * array_stride;
1998 } else {
1999 /* Variable index array dereference. It eats the "vec4" of the
2000 * base of the array and an index that offsets the Mesa register
2001 * index.
2002 */
2003 ir->array_index->accept(this);
2004
2005 src_reg index_reg;
2006
2007 if (array_stride == 1) {
2008 index_reg = this->result;
2009 } else {
2010 index_reg = src_reg(this, glsl_type::int_type);
2011
2012 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
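         /* Illustrative example: for an array of mat4 the per-element stride
          * is 4 registers, so an index of 2 scales to a register offset of 8.
          */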
2013 }
2014
2015 if (src.reladdr) {
2016 src_reg temp = src_reg(this, glsl_type::int_type);
2017
2018 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2019
2020 index_reg = temp;
2021 }
2022
2023 src.reladdr = ralloc(mem_ctx, src_reg);
2024 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2025 }
2026
2027 /* If the type is smaller than a vec4, replicate the last channel out. */
2028 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2029 src.swizzle = swizzle_for_size(ir->type->vector_elements);
2030 else
2031 src.swizzle = BRW_SWIZZLE_NOOP;
2032 src.type = brw_type_for_base_type(ir->type);
2033
2034 this->result = src;
2035 }
2036
2037 void
2038 vec4_visitor::visit(ir_dereference_record *ir)
2039 {
2040 unsigned int i;
2041 const glsl_type *struct_type = ir->record->type;
2042 int offset = 0;
2043
2044 ir->record->accept(this);
2045
2046 for (i = 0; i < struct_type->length; i++) {
2047 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2048 break;
2049 offset += type_size(struct_type->fields.structure[i].type);
2050 }
2051
2052 /* If the type is smaller than a vec4, replicate the last channel out. */
2053 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2054 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2055 else
2056 this->result.swizzle = BRW_SWIZZLE_NOOP;
2057 this->result.type = brw_type_for_base_type(ir->type);
2058
2059 this->result.reg_offset += offset;
2060 }
2061
2062 /**
2063 * We want to be careful in assignment setup to hit the actual storage
2064 * instead of potentially using a temporary like we might with the
2065 * ir_dereference handler.
2066 */
2067 static dst_reg
2068 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2069 {
2070 /* The LHS must be a dereference. If the LHS is a variable indexed array
2071     * access of a vector, it must be separated into a series of conditional
2072     * moves before reaching this point (see ir_vec_index_to_cond_assign).
2073 */
2074 assert(ir->as_dereference());
2075 ir_dereference_array *deref_array = ir->as_dereference_array();
2076 if (deref_array) {
2077 assert(!deref_array->array->type->is_vector());
2078 }
2079
2080 /* Use the rvalue deref handler for the most part. We'll ignore
2081 * swizzles in it and write swizzles using writemask, though.
2082 */
2083 ir->accept(v);
2084 return dst_reg(v->result);
2085 }
2086
2087 void
2088 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2089 const struct glsl_type *type,
2090 enum brw_predicate predicate)
2091 {
2092 if (type->base_type == GLSL_TYPE_STRUCT) {
2093 for (unsigned int i = 0; i < type->length; i++) {
2094 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2095 }
2096 return;
2097 }
2098
2099 if (type->is_array()) {
2100 for (unsigned int i = 0; i < type->length; i++) {
2101 emit_block_move(dst, src, type->fields.array, predicate);
2102 }
2103 return;
2104 }
2105
2106 if (type->is_matrix()) {
2107 const struct glsl_type *vec_type;
2108
2109 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2110 type->vector_elements, 1);
2111
2112 for (int i = 0; i < type->matrix_columns; i++) {
2113 emit_block_move(dst, src, vec_type, predicate);
2114 }
2115 return;
2116 }
2117
2118 assert(type->is_scalar() || type->is_vector());
2119
2120 dst->type = brw_type_for_base_type(type);
2121 src->type = dst->type;
2122
2123 dst->writemask = (1 << type->vector_elements) - 1;
2124
2125 src->swizzle = swizzle_for_size(type->vector_elements);
2126
2127 vec4_instruction *inst = emit(MOV(*dst, *src));
2128 inst->predicate = predicate;
2129
2130 dst->reg_offset++;
2131 src->reg_offset++;
2132 }
2133
2134
2135 /* If the RHS processing resulted in an instruction generating a
2136 * temporary value, and it would be easy to rewrite the instruction to
2137 * generate its result right into the LHS instead, do so. This ends
2138 * up reliably removing instructions where it can be tricky to do so
2139 * later without real UD chain information.
2140 */
2141 bool
2142 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2143 dst_reg dst,
2144 src_reg src,
2145 vec4_instruction *pre_rhs_inst,
2146 vec4_instruction *last_rhs_inst)
2147 {
2148 /* This could be supported, but it would take more smarts. */
2149 if (ir->condition)
2150 return false;
2151
2152 if (pre_rhs_inst == last_rhs_inst)
2153 return false; /* No instructions generated to work with. */
2154
2155 /* Make sure the last instruction generated our source reg. */
2156 if (src.file != GRF ||
2157 src.file != last_rhs_inst->dst.file ||
2158 src.reg != last_rhs_inst->dst.reg ||
2159 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2160 src.reladdr ||
2161 src.abs ||
2162 src.negate ||
2163 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2164 return false;
2165
2166     /* Check that the last instruction fully initialized the channels
2167 * we want to use, in the order we want to use them. We could
2168 * potentially reswizzle the operands of many instructions so that
2169 * we could handle out of order channels, but don't yet.
2170 */
2171
2172 for (unsigned i = 0; i < 4; i++) {
2173 if (dst.writemask & (1 << i)) {
2174 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2175 return false;
2176
2177 if (BRW_GET_SWZ(src.swizzle, i) != i)
2178 return false;
2179 }
2180 }
2181
2182 /* Success! Rewrite the instruction. */
2183 last_rhs_inst->dst.file = dst.file;
2184 last_rhs_inst->dst.reg = dst.reg;
2185 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2186 last_rhs_inst->dst.reladdr = dst.reladdr;
2187 last_rhs_inst->dst.writemask &= dst.writemask;
2188
2189 return true;
2190 }
2191
2192 void
2193 vec4_visitor::visit(ir_assignment *ir)
2194 {
2195 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2196 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2197
2198 if (!ir->lhs->type->is_scalar() &&
2199 !ir->lhs->type->is_vector()) {
2200 ir->rhs->accept(this);
2201 src_reg src = this->result;
2202
2203 if (ir->condition) {
2204 emit_bool_to_cond_code(ir->condition, &predicate);
2205 }
2206
2207 /* emit_block_move doesn't account for swizzles in the source register.
2208 * This should be ok, since the source register is a structure or an
2209 * array, and those can't be swizzled. But double-check to be sure.
2210 */
2211 assert(src.swizzle ==
2212 (ir->rhs->type->is_matrix()
2213 ? swizzle_for_size(ir->rhs->type->vector_elements)
2214 : BRW_SWIZZLE_NOOP));
2215
2216 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2217 return;
2218 }
2219
2220 /* Now we're down to just a scalar/vector with writemasks. */
2221 int i;
2222
2223 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2224 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2225
2226 ir->rhs->accept(this);
2227
2228 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2229
2230 src_reg src = this->result;
2231
2232 int swizzles[4];
2233 int first_enabled_chan = 0;
2234 int src_chan = 0;
2235
2236 assert(ir->lhs->type->is_vector() ||
2237 ir->lhs->type->is_scalar());
2238 dst.writemask = ir->write_mask;
2239
2240 for (int i = 0; i < 4; i++) {
2241 if (dst.writemask & (1 << i)) {
2242 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2243 break;
2244 }
2245 }
2246
2247 /* Swizzle a small RHS vector into the channels being written.
2248 *
2249     * GLSL IR treats write_mask as dictating how many channels are
2250     * present on the RHS, while in our instructions we need to make
2251 * those channels appear in the slots of the vec4 they're written to.
2252 */
2253 for (int i = 0; i < 4; i++) {
2254 if (dst.writemask & (1 << i))
2255 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2256 else
2257 swizzles[i] = first_enabled_chan;
2258 }
2259 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2260 swizzles[2], swizzles[3]);
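   /* For illustration: a vec2 RHS (swizzle XYYY) written with write_mask .yz
    * yields the swizzle (Y, X, Y, Y), so RHS .x lands in dst.y and RHS .y in
    * dst.z, while the disabled channels just repeat a defined component.
    */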
2261
2262 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2263 return;
2264 }
2265
2266 if (ir->condition) {
2267 emit_bool_to_cond_code(ir->condition, &predicate);
2268 }
2269
2270 for (i = 0; i < type_size(ir->lhs->type); i++) {
2271 vec4_instruction *inst = emit(MOV(dst, src));
2272 inst->predicate = predicate;
2273
2274 dst.reg_offset++;
2275 src.reg_offset++;
2276 }
2277 }
2278
2279 void
2280 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2281 {
2282 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2283 foreach_in_list(ir_constant, field_value, &ir->components) {
2284 emit_constant_values(dst, field_value);
2285 }
2286 return;
2287 }
2288
2289 if (ir->type->is_array()) {
2290 for (unsigned int i = 0; i < ir->type->length; i++) {
2291 emit_constant_values(dst, ir->array_elements[i]);
2292 }
2293 return;
2294 }
2295
2296 if (ir->type->is_matrix()) {
2297 for (int i = 0; i < ir->type->matrix_columns; i++) {
2298 float *vec = &ir->value.f[i * ir->type->vector_elements];
2299
2300 for (int j = 0; j < ir->type->vector_elements; j++) {
2301 dst->writemask = 1 << j;
2302 dst->type = BRW_REGISTER_TYPE_F;
2303
2304 emit(MOV(*dst, src_reg(vec[j])));
2305 }
2306 dst->reg_offset++;
2307 }
2308 return;
2309 }
2310
2311 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2312
2313 for (int i = 0; i < ir->type->vector_elements; i++) {
2314 if (!(remaining_writemask & (1 << i)))
2315 continue;
2316
2317 dst->writemask = 1 << i;
2318 dst->type = brw_type_for_base_type(ir->type);
2319
2320 /* Find other components that match the one we're about to
2321 * write. Emits fewer instructions for things like vec4(0.5,
2322 * 1.5, 1.5, 1.5).
2323 */
2324 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2325 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2326 if (ir->value.b[i] == ir->value.b[j])
2327 dst->writemask |= (1 << j);
2328 } else {
2329 /* u, i, and f storage all line up, so no need for a
2330 * switch case for comparing each type.
2331 */
2332 if (ir->value.u[i] == ir->value.u[j])
2333 dst->writemask |= (1 << j);
2334 }
2335 }
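      /* With the vec4(0.5, 1.5, 1.5, 1.5) example above, this coalescing
       * emits two MOVs (writemask .x and writemask .yzw) rather than four.
       */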
2336
2337 switch (ir->type->base_type) {
2338 case GLSL_TYPE_FLOAT:
2339 emit(MOV(*dst, src_reg(ir->value.f[i])));
2340 break;
2341 case GLSL_TYPE_INT:
2342 emit(MOV(*dst, src_reg(ir->value.i[i])));
2343 break;
2344 case GLSL_TYPE_UINT:
2345 emit(MOV(*dst, src_reg(ir->value.u[i])));
2346 break;
2347 case GLSL_TYPE_BOOL:
2348 emit(MOV(*dst,
2349 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2350 : 0)));
2351 break;
2352 default:
2353 unreachable("Non-float/uint/int/bool constant");
2354 }
2355
2356 remaining_writemask &= ~dst->writemask;
2357 }
2358 dst->reg_offset++;
2359 }
2360
2361 void
2362 vec4_visitor::visit(ir_constant *ir)
2363 {
2364 dst_reg dst = dst_reg(this, ir->type);
2365 this->result = src_reg(dst);
2366
2367 emit_constant_values(&dst, ir);
2368 }
2369
2370 void
2371 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2372 {
2373 ir_dereference *deref = static_cast<ir_dereference *>(
2374 ir->actual_parameters.get_head());
2375 ir_variable *location = deref->variable_referenced();
2376 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2377 location->data.binding);
2378
2379 /* Calculate the surface offset */
2380 src_reg offset(this, glsl_type::uint_type);
2381 ir_dereference_array *deref_array = deref->as_dereference_array();
2382 if (deref_array) {
2383 deref_array->array_index->accept(this);
2384
2385 src_reg tmp(this, glsl_type::uint_type);
2386 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2387 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2388 } else {
2389 offset = location->data.atomic.offset;
2390 }
2391
2392 /* Emit the appropriate machine instruction */
2393 const char *callee = ir->callee->function_name();
2394 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2395
2396 if (!strcmp("__intrinsic_atomic_read", callee)) {
2397 emit_untyped_surface_read(surf_index, dst, offset);
2398
2399 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2400 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2401 src_reg(), src_reg());
2402
2403 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2404 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2405 src_reg(), src_reg());
2406 }
2407 }
2408
2409 void
2410 vec4_visitor::visit(ir_call *ir)
2411 {
2412 const char *callee = ir->callee->function_name();
2413
2414 if (!strcmp("__intrinsic_atomic_read", callee) ||
2415 !strcmp("__intrinsic_atomic_increment", callee) ||
2416 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2417 visit_atomic_counter_intrinsic(ir);
2418 } else {
2419 unreachable("Unsupported intrinsic.");
2420 }
2421 }
2422
2423 src_reg
2424 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2425 {
2426 vec4_instruction *inst =
2427 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2428 dst_reg(this, glsl_type::uvec4_type));
2429 inst->base_mrf = 2;
2430 inst->mlen = 1;
2431 inst->src[1] = sampler;
2432
2433    /* Parameters are u, v, r, lod; lod will always be zero due to API restrictions. */
2434 int param_base = inst->base_mrf;
2435 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2436 int zero_mask = 0xf & ~coord_mask;
2437
2438 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2439 coordinate));
2440
2441 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2442 src_reg(0)));
2443
2444 emit(inst);
2445 return src_reg(inst->dst);
2446 }
2447
2448 static bool
2449 is_high_sampler(struct brw_context *brw, src_reg sampler)
2450 {
2451 if (brw->gen < 8 && !brw->is_haswell)
2452 return false;
2453
2454 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2455 }
2456
2457 void
2458 vec4_visitor::visit(ir_texture *ir)
2459 {
2460 uint32_t sampler =
2461 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2462
2463 ir_rvalue *nonconst_sampler_index =
2464 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2465
2466 /* Handle non-constant sampler array indexing */
2467 src_reg sampler_reg;
2468 if (nonconst_sampler_index) {
2469 /* The highest sampler which may be used by this operation is
2470 * the last element of the array. Mark it here, because the generator
2471 * doesn't have enough information to determine the bound.
2472 */
2473 uint32_t array_size = ir->sampler->as_dereference_array()
2474 ->array->type->array_size();
2475
2476 uint32_t max_used = sampler + array_size - 1;
2477 if (ir->op == ir_tg4 && brw->gen < 8) {
2478 max_used += prog_data->base.binding_table.gather_texture_start;
2479 } else {
2480 max_used += prog_data->base.binding_table.texture_start;
2481 }
2482
2483 brw_mark_surface_used(&prog_data->base, max_used);
2484
2485 /* Emit code to evaluate the actual indexing expression */
2486 nonconst_sampler_index->accept(this);
2487 dst_reg temp(this, glsl_type::uint_type);
2488 emit(ADD(temp, this->result, src_reg(sampler)))
2489 ->force_writemask_all = true;
2490 sampler_reg = src_reg(temp);
2491 } else {
2492 /* Single sampler, or constant array index; the indexing expression
2493 * is just an immediate.
2494 */
2495 sampler_reg = src_reg(sampler);
2496 }
2497
2498 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2499 * emitting anything other than setting up the constant result.
2500 */
2501 if (ir->op == ir_tg4) {
2502 ir_constant *chan = ir->lod_info.component->as_constant();
2503 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2504 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2505 dst_reg result(this, ir->type);
2506 this->result = src_reg(result);
2507 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2508 return;
2509 }
2510 }
2511
2512 /* Should be lowered by do_lower_texture_projection */
2513 assert(!ir->projector);
2514
2515 /* Should be lowered */
2516 assert(!ir->offset || !ir->offset->type->is_array());
2517
2518 /* Generate code to compute all the subexpression trees. This has to be
2519 * done before loading any values into MRFs for the sampler message since
2520 * generating these values may involve SEND messages that need the MRFs.
2521 */
2522 src_reg coordinate;
2523 if (ir->coordinate) {
2524 ir->coordinate->accept(this);
2525 coordinate = this->result;
2526 }
2527
2528 src_reg shadow_comparitor;
2529 if (ir->shadow_comparitor) {
2530 ir->shadow_comparitor->accept(this);
2531 shadow_comparitor = this->result;
2532 }
2533
2534 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2535 src_reg offset_value;
2536 if (has_nonconstant_offset) {
2537 ir->offset->accept(this);
2538 offset_value = src_reg(this->result);
2539 }
2540
2541 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2542 src_reg lod, dPdx, dPdy, sample_index, mcs;
2543 switch (ir->op) {
2544 case ir_tex:
2545 lod = src_reg(0.0f);
2546 lod_type = glsl_type::float_type;
2547 break;
2548 case ir_txf:
2549 case ir_txl:
2550 case ir_txs:
2551 ir->lod_info.lod->accept(this);
2552 lod = this->result;
2553 lod_type = ir->lod_info.lod->type;
2554 break;
2555 case ir_query_levels:
2556 lod = src_reg(0);
2557 lod_type = glsl_type::int_type;
2558 break;
2559 case ir_txf_ms:
2560 ir->lod_info.sample_index->accept(this);
2561 sample_index = this->result;
2562 sample_index_type = ir->lod_info.sample_index->type;
2563
2564 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2565 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2566 else
2567 mcs = src_reg(0u);
2568 break;
2569 case ir_txd:
2570 ir->lod_info.grad.dPdx->accept(this);
2571 dPdx = this->result;
2572
2573 ir->lod_info.grad.dPdy->accept(this);
2574 dPdy = this->result;
2575
2576 lod_type = ir->lod_info.grad.dPdx->type;
2577 break;
2578 case ir_txb:
2579 case ir_lod:
2580 case ir_tg4:
2581 break;
2582 }
2583
2584 enum opcode opcode;
2585 switch (ir->op) {
2586 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2587 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2588 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2589 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2590 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2591 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2592 case ir_tg4: opcode = has_nonconstant_offset
2593 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2594 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2595 case ir_txb:
2596 unreachable("TXB is not valid for vertex shaders.");
2597 case ir_lod:
2598 unreachable("LOD is not valid for vertex shaders.");
2599 default:
2600 unreachable("Unrecognized tex op");
2601 }
2602
2603 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2604 opcode, dst_reg(this, ir->type));
2605
2606 if (ir->offset != NULL && !has_nonconstant_offset) {
2607 inst->offset =
2608 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2609 ir->offset->type->vector_elements);
2610 }
2611
2612 /* Stuff the channel select bits in the top of the texture offset */
2613 if (ir->op == ir_tg4)
2614 inst->offset |= gather_channel(ir, sampler) << 16;
2615
2616 /* The message header is necessary for:
2617 * - Gen4 (always)
2618 * - Gen9+ for selecting SIMD4x2
2619 * - Texel offsets
2620 * - Gather channel selection
2621 * - Sampler indices too large to fit in a 4-bit value.
2622 */
2623 inst->header_present =
2624 brw->gen < 5 || brw->gen >= 9 ||
2625 inst->offset != 0 || ir->op == ir_tg4 ||
2626 is_high_sampler(brw, sampler_reg);
2627 inst->base_mrf = 2;
2628 inst->mlen = inst->header_present + 1; /* always at least one */
2629 inst->dst.writemask = WRITEMASK_XYZW;
2630 inst->shadow_compare = ir->shadow_comparitor != NULL;
2631
2632 inst->src[1] = sampler_reg;
2633
2634 /* MRF for the first parameter */
2635 int param_base = inst->base_mrf + inst->header_present;
2636
2637 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2638 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2639 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2640 } else {
2641 /* Load the coordinate */
2642 /* FINISHME: gl_clamp_mask and saturate */
2643 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2644 int zero_mask = 0xf & ~coord_mask;
2645
2646 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2647 coordinate));
2648
2649 if (zero_mask != 0) {
2650 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2651 src_reg(0)));
2652 }
2653 /* Load the shadow comparitor */
2654 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2655 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2656 WRITEMASK_X),
2657 shadow_comparitor));
2658 inst->mlen++;
2659 }
2660
2661 /* Load the LOD info */
2662 if (ir->op == ir_tex || ir->op == ir_txl) {
2663 int mrf, writemask;
2664 if (brw->gen >= 5) {
2665 mrf = param_base + 1;
2666 if (ir->shadow_comparitor) {
2667 writemask = WRITEMASK_Y;
2668 /* mlen already incremented */
2669 } else {
2670 writemask = WRITEMASK_X;
2671 inst->mlen++;
2672 }
2673 } else /* brw->gen == 4 */ {
2674 mrf = param_base;
2675 writemask = WRITEMASK_W;
2676 }
2677 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2678 } else if (ir->op == ir_txf) {
2679 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2680 } else if (ir->op == ir_txf_ms) {
2681 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2682 sample_index));
2683 if (brw->gen >= 7) {
2684 /* MCS data is in the first channel of `mcs`, but we need to get it into
2685 * the .y channel of the second vec4 of params, so replicate .x across
2686 * the whole vec4 and then mask off everything except .y
2687 */
2688 mcs.swizzle = BRW_SWIZZLE_XXXX;
2689 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2690 mcs));
2691 }
2692 inst->mlen++;
2693 } else if (ir->op == ir_txd) {
2694 const glsl_type *type = lod_type;
2695
2696 if (brw->gen >= 5) {
2697 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2698 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2699 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2700 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2701 inst->mlen++;
2702
2703 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2704 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2705 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2706 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2707 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2708 inst->mlen++;
2709
2710 if (ir->shadow_comparitor) {
2711 emit(MOV(dst_reg(MRF, param_base + 2,
2712 ir->shadow_comparitor->type, WRITEMASK_Z),
2713 shadow_comparitor));
2714 }
2715 }
2716 } else /* brw->gen == 4 */ {
2717 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2718 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2719 inst->mlen += 2;
2720 }
2721 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2722 if (ir->shadow_comparitor) {
2723 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2724 shadow_comparitor));
2725 }
2726
2727 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2728 offset_value));
2729 inst->mlen++;
2730 }
2731 }
2732
2733 emit(inst);
2734
2735    /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2736     * faces * layers, but the spec requires just the layer count.
2737 */
2738 if (ir->op == ir_txs) {
2739 glsl_type const *type = ir->sampler->type;
2740 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2741 type->sampler_array) {
2742 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2743 writemask(inst->dst, WRITEMASK_Z),
2744 src_reg(inst->dst), src_reg(6));
2745 }
2746 }
2747
2748 if (brw->gen == 6 && ir->op == ir_tg4) {
2749 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2750 }
2751
2752 swizzle_result(ir, src_reg(inst->dst), sampler);
2753 }
2754
2755 /**
2756 * Apply workarounds for Gen6 gather with UINT/SINT
2757 */
2758 void
2759 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2760 {
2761 if (!wa)
2762 return;
2763
2764 int width = (wa & WA_8BIT) ? 8 : 16;
2765 dst_reg dst_f = dst;
2766 dst_f.type = BRW_REGISTER_TYPE_F;
2767
2768 /* Convert from UNORM to UINT */
2769 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2770 emit(MOV(dst, src_reg(dst_f)));
2771
2772 if (wa & WA_SIGN) {
2773 /* Reinterpret the UINT value as a signed INT value by
2774 * shifting the sign bit into place, then shifting back
2775 * preserving sign.
2776 */
2777 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2778 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2779 }
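   /* Illustrative walk-through: an 8-bit SINT texel of 0xff is returned as
    * UNORM 1.0; the MUL/MOV above turn that into the integer 255, and the
    * SHL/ASR by 24 sign-extend it back to -1.
    */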
2780 }
2781
2782 /**
2783 * Set up the gather channel based on the swizzle, for gather4.
2784 */
2785 uint32_t
2786 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2787 {
2788 ir_constant *chan = ir->lod_info.component->as_constant();
2789 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2790 switch (swiz) {
2791 case SWIZZLE_X: return 0;
2792 case SWIZZLE_Y:
2793 /* gather4 sampler is broken for green channel on RG32F --
2794 * we must ask for blue instead.
2795 */
2796 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2797 return 2;
2798 return 1;
2799 case SWIZZLE_Z: return 2;
2800 case SWIZZLE_W: return 3;
2801 default:
2802 unreachable("Not reached"); /* zero, one swizzles handled already */
2803 }
2804 }
2805
2806 void
2807 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2808 {
2809 int s = key->tex.swizzles[sampler];
2810
2811 this->result = src_reg(this, ir->type);
2812 dst_reg swizzled_result(this->result);
2813
2814 if (ir->op == ir_query_levels) {
2815 /* # levels is in .w */
2816 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2817 emit(MOV(swizzled_result, orig_val));
2818 return;
2819 }
2820
2821 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2822 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2823 emit(MOV(swizzled_result, orig_val));
2824 return;
2825 }
2826
2827
2828 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2829 int swizzle[4] = {0};
2830
2831 for (int i = 0; i < 4; i++) {
2832 switch (GET_SWZ(s, i)) {
2833 case SWIZZLE_ZERO:
2834 zero_mask |= (1 << i);
2835 break;
2836 case SWIZZLE_ONE:
2837 one_mask |= (1 << i);
2838 break;
2839 default:
2840 copy_mask |= (1 << i);
2841 swizzle[i] = GET_SWZ(s, i);
2842 break;
2843 }
2844 }
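   /* For example, a format swizzle of (R, R, R, ONE) produces copy_mask .xyz
    * with swizzle XXXX plus one_mask .w, so the code below emits one swizzled
    * MOV and one MOV of 1.0f.
    */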
2845
2846 if (copy_mask) {
2847 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2848 swizzled_result.writemask = copy_mask;
2849 emit(MOV(swizzled_result, orig_val));
2850 }
2851
2852 if (zero_mask) {
2853 swizzled_result.writemask = zero_mask;
2854 emit(MOV(swizzled_result, src_reg(0.0f)));
2855 }
2856
2857 if (one_mask) {
2858 swizzled_result.writemask = one_mask;
2859 emit(MOV(swizzled_result, src_reg(1.0f)));
2860 }
2861 }
2862
2863 void
2864 vec4_visitor::visit(ir_return *)
2865 {
2866 unreachable("not reached");
2867 }
2868
2869 void
2870 vec4_visitor::visit(ir_discard *)
2871 {
2872 unreachable("not reached");
2873 }
2874
2875 void
2876 vec4_visitor::visit(ir_if *ir)
2877 {
2878 /* Don't point the annotation at the if statement, because then it plus
2879 * the then and else blocks get printed.
2880 */
2881 this->base_ir = ir->condition;
2882
2883 if (brw->gen == 6) {
2884 emit_if_gen6(ir);
2885 } else {
2886 enum brw_predicate predicate;
2887 emit_bool_to_cond_code(ir->condition, &predicate);
2888 emit(IF(predicate));
2889 }
2890
2891 visit_instructions(&ir->then_instructions);
2892
2893 if (!ir->else_instructions.is_empty()) {
2894 this->base_ir = ir->condition;
2895 emit(BRW_OPCODE_ELSE);
2896
2897 visit_instructions(&ir->else_instructions);
2898 }
2899
2900 this->base_ir = ir->condition;
2901 emit(BRW_OPCODE_ENDIF);
2902 }
2903
2904 void
2905 vec4_visitor::visit(ir_emit_vertex *)
2906 {
2907 unreachable("not reached");
2908 }
2909
2910 void
2911 vec4_visitor::visit(ir_end_primitive *)
2912 {
2913 unreachable("not reached");
2914 }
2915
2916 void
2917 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2918 dst_reg dst, src_reg offset,
2919 src_reg src0, src_reg src1)
2920 {
2921 unsigned mlen = 0;
2922
2923 /* Set the atomic operation offset. */
2924 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2925 mlen++;
2926
2927 /* Set the atomic operation arguments. */
2928 if (src0.file != BAD_FILE) {
2929 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2930 mlen++;
2931 }
2932
2933 if (src1.file != BAD_FILE) {
2934 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2935 mlen++;
2936 }
2937
2938 /* Emit the instruction. Note that this maps to the normal SIMD8
2939 * untyped atomic message on Ivy Bridge, but that's OK because
2940 * unused channels will be masked out.
2941 */
2942 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2943 src_reg(atomic_op), src_reg(surf_index));
2944 inst->base_mrf = 0;
2945 inst->mlen = mlen;
2946 }
2947
2948 void
2949 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2950 src_reg offset)
2951 {
2952 /* Set the surface read offset. */
2953 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2954
2955 /* Emit the instruction. Note that this maps to the normal SIMD8
2956 * untyped surface read message, but that's OK because unused
2957 * channels will be masked out.
2958 */
2959 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2960 dst, src_reg(surf_index));
2961 inst->base_mrf = 0;
2962 inst->mlen = 1;
2963 }
2964
2965 void
2966 vec4_visitor::emit_ndc_computation()
2967 {
2968 /* Get the position */
2969 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2970
2971 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2972 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2973 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2974
2975 current_annotation = "NDC";
2976 dst_reg ndc_w = ndc;
2977 ndc_w.writemask = WRITEMASK_W;
2978 src_reg pos_w = pos;
2979 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2980 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2981
2982 dst_reg ndc_xyz = ndc;
2983 ndc_xyz.writemask = WRITEMASK_XYZ;
2984
2985 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2986 }
2987
2988 void
2989 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2990 {
2991 if (brw->gen < 6 &&
2992 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2993 key->userclip_active || brw->has_negative_rhw_bug)) {
2994 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2995 dst_reg header1_w = header1;
2996 header1_w.writemask = WRITEMASK_W;
2997
2998 emit(MOV(header1, 0u));
2999
3000 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3001 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3002
3003 current_annotation = "Point size";
3004 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3005 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3006 }
3007
3008 if (key->userclip_active) {
3009 current_annotation = "Clipping flags";
3010 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3011 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3012
3013 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3014 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3015 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3016
3017 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3018 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3019 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3020 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3021 }
3022
3023 /* i965 clipping workaround:
3024 * 1) Test for -ve rhw
3025 * 2) If set,
3026 * set ndc = (0,0,0,0)
3027 * set ucp[6] = 1
3028 *
3029 * Later, clipping will detect ucp[6] and ensure the primitive is
3030 * clipped against all fixed planes.
3031 */
3032 if (brw->has_negative_rhw_bug) {
3033 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3034 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3035 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3036 vec4_instruction *inst;
3037 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3038 inst->predicate = BRW_PREDICATE_NORMAL;
3039 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3040 inst->predicate = BRW_PREDICATE_NORMAL;
3041 }
3042
3043 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3044 } else if (brw->gen < 6) {
3045 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3046 } else {
3047 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3048 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3049 dst_reg reg_w = reg;
3050 reg_w.writemask = WRITEMASK_W;
3051 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3052 }
3053 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3054 dst_reg reg_y = reg;
3055 reg_y.writemask = WRITEMASK_Y;
3056 reg_y.type = BRW_REGISTER_TYPE_D;
3057 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3058 }
3059 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3060 dst_reg reg_z = reg;
3061 reg_z.writemask = WRITEMASK_Z;
3062 reg_z.type = BRW_REGISTER_TYPE_D;
3063 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3064 }
3065 }
3066 }
3067
3068 void
3069 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3070 {
3071 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3072 *
3073 * "If a linked set of shaders forming the vertex stage contains no
3074 * static write to gl_ClipVertex or gl_ClipDistance, but the
3075 * application has requested clipping against user clip planes through
3076 * the API, then the coordinate written to gl_Position is used for
3077 * comparison against the user clip planes."
3078 *
3079 * This function is only called if the shader didn't write to
3080 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3081 * if the user wrote to it; otherwise we use gl_Position.
3082 */
3083 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3084 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3085 clip_vertex = VARYING_SLOT_POS;
3086 }
3087
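   /* Each iteration below emits a DP4 computing dot(clip_vertex, plane); for
    * example, with offset == 4 this fills .x through .w of the second clip
    * distance slot for planes 4-7, assuming that many planes are enabled.
    */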
3088 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3089 ++i) {
3090 reg.writemask = 1 << i;
3091 emit(DP4(reg,
3092 src_reg(output_reg[clip_vertex]),
3093 src_reg(this->userplane[i + offset])));
3094 }
3095 }
3096
3097 vec4_instruction *
3098 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3099 {
3100 assert (varying < VARYING_SLOT_MAX);
3101 reg.type = output_reg[varying].type;
3102 current_annotation = output_reg_annotation[varying];
3103 /* Copy the register, saturating if necessary */
3104 return emit(MOV(reg, src_reg(output_reg[varying])));
3105 }
3106
3107 void
3108 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3109 {
3110 reg.type = BRW_REGISTER_TYPE_F;
3111
3112 switch (varying) {
3113 case VARYING_SLOT_PSIZ:
3114 {
3115 /* PSIZ is always in slot 0, and is coupled with other flags. */
3116 current_annotation = "indices, point width, clip flags";
3117 emit_psiz_and_flags(reg);
3118 break;
3119 }
3120 case BRW_VARYING_SLOT_NDC:
3121 current_annotation = "NDC";
3122 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3123 break;
3124 case VARYING_SLOT_POS:
3125 current_annotation = "gl_Position";
3126 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3127 break;
3128 case VARYING_SLOT_EDGE:
3129 /* This is present when doing unfilled polygons. We're supposed to copy
3130 * the edge flag from the user-provided vertex array
3131 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3132 * of that attribute (starts as 1.0f). This is then used in clipping to
3133 * determine which edges should be drawn as wireframe.
3134 */
3135 current_annotation = "edge flag";
3136 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3137 glsl_type::float_type, WRITEMASK_XYZW))));
3138 break;
3139 case BRW_VARYING_SLOT_PAD:
3140 /* No need to write to this slot */
3141 break;
3142 case VARYING_SLOT_COL0:
3143 case VARYING_SLOT_COL1:
3144 case VARYING_SLOT_BFC0:
3145 case VARYING_SLOT_BFC1: {
3146 /* These built-in varyings are only supported in compatibility mode,
3147 * and we only support GS in core profile. So, this must be a vertex
3148 * shader.
3149 */
3150 assert(stage == MESA_SHADER_VERTEX);
3151 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3152 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3153 inst->saturate = true;
3154 break;
3155 }
3156
3157 default:
3158 emit_generic_urb_slot(reg, varying);
3159 break;
3160 }
3161 }
3162
3163 static int
3164 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3165 {
3166 if (brw->gen >= 6) {
3167 /* URB data written (does not include the message header reg) must
3168 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3169 * section 5.4.3.2.2: URB_INTERLEAVED.
3170 *
3171 * URB entries are allocated on a multiple of 1024 bits, so an
3172 * extra 128 bits written here to make the end align to 256 is
3173 * no problem.
3174 */
3175 if ((mlen % 2) != 1)
3176 mlen++;
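      /* For example, a header plus five data registers gives mlen == 6;
       * bumping it to 7 pads the data written to six registers, a multiple
       * of two.
       */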
3177 }
3178
3179 return mlen;
3180 }
3181
3182
3183 /**
3184 * Generates the VUE payload plus the necessary URB write instructions to
3185 * output it.
3186 *
3187 * The VUE layout is documented in Volume 2a.
3188 */
3189 void
3190 vec4_visitor::emit_vertex()
3191 {
3192 /* MRF 0 is reserved for the debugger, so start with message header
3193 * in MRF 1.
3194 */
3195 int base_mrf = 1;
3196 int mrf = base_mrf;
3197 /* In the process of generating our URB write message contents, we
3198 * may need to unspill a register or load from an array. Those
3199 * reads would use MRFs 14-15.
3200 */
3201 int max_usable_mrf = 13;
3202
3203 /* The following assertion verifies that max_usable_mrf causes an
3204 * even-numbered amount of URB write data, which will meet gen6's
3205 * requirements for length alignment.
3206 */
3207 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3208
3209 /* First mrf is the g0-based message header containing URB handles and
3210 * such.
3211 */
3212 emit_urb_write_header(mrf++);
3213
3214 if (brw->gen < 6) {
3215 emit_ndc_computation();
3216 }
3217
3218 /* Lower legacy ff and ClipVertex clipping to clip distances */
3219 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3220 current_annotation = "user clip distances";
3221
3222 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3223 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3224
3225 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3226 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3227 }
3228
3229 /* We may need to split this up into several URB writes, so do them in a
3230 * loop.
3231 */
3232 int slot = 0;
3233 bool complete = false;
3234 do {
3235 /* URB offset is in URB row increments, and each of our MRFs is half of
3236 * one of those, since we're doing interleaved writes.
3237 */
3238 int offset = slot / 2;
3239
3240 mrf = base_mrf + 1;
3241 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3242 emit_urb_slot(dst_reg(MRF, mrf++),
3243 prog_data->vue_map.slot_to_varying[slot]);
3244
3245 /* If this was max_usable_mrf, we can't fit anything more into this
3246 * URB WRITE.
3247 */
3248 if (mrf > max_usable_mrf) {
3249 slot++;
3250 break;
3251 }
3252 }
3253
3254 complete = slot >= prog_data->vue_map.num_slots;
3255 current_annotation = "URB write";
3256 vec4_instruction *inst = emit_urb_write_opcode(complete);
3257 inst->base_mrf = base_mrf;
3258 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3259 inst->offset += offset;
3260    } while (!complete);
3261 }
3262
3263
3264 src_reg
3265 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3266 src_reg *reladdr, int reg_offset)
3267 {
3268 /* Because we store the values to scratch interleaved like our
3269 * vertex data, we need to scale the vec4 index by 2.
3270 */
3271 int message_header_scale = 2;
3272
3273 /* Pre-gen6, the message header uses byte offsets instead of vec4
3274 * (16-byte) offset units.
3275 */
3276 if (brw->gen < 6)
3277 message_header_scale *= 16;
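   /* Illustrative: a reg_offset of 3 therefore becomes row offset 6 on Gen6+
    * (interleaved vec4 rows), or byte offset 96 on Gen4-5.
    */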
3278
3279 if (reladdr) {
3280 src_reg index = src_reg(this, glsl_type::int_type);
3281
3282 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3283 src_reg(reg_offset)));
3284 emit_before(block, inst, MUL(dst_reg(index), index,
3285 src_reg(message_header_scale)));
3286
3287 return index;
3288 } else {
3289 return src_reg(reg_offset * message_header_scale);
3290 }
3291 }
3292
3293 src_reg
3294 vec4_visitor::get_pull_constant_offset(bblock_t *block, vec4_instruction *inst,
3295 src_reg *reladdr, int reg_offset)
3296 {
3297 if (reladdr) {
3298 src_reg index = src_reg(this, glsl_type::int_type);
3299
3300 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3301 src_reg(reg_offset)));
3302
3303 /* Pre-gen6, the message header uses byte offsets instead of vec4
3304 * (16-byte) offset units.
3305 */
3306 if (brw->gen < 6) {
3307 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3308 }
3309
3310 return index;
3311 } else if (brw->gen >= 8) {
3312 /* Store the offset in a GRF so we can send-from-GRF. */
3313 src_reg offset = src_reg(this, glsl_type::int_type);
3314 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3315 return offset;
3316 } else {
3317 int message_header_scale = brw->gen < 6 ? 16 : 1;
3318 return src_reg(reg_offset * message_header_scale);
3319 }
3320 }
3321
3322 /**
3323 * Emits an instruction before @inst to load the value named by @orig_src
3324 * from scratch space at @base_offset to @temp.
3325 *
3326 * @base_offset is measured in 32-byte units (the size of a register).
3327 */
3328 void
3329 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3330 dst_reg temp, src_reg orig_src,
3331 int base_offset)
3332 {
3333 int reg_offset = base_offset + orig_src.reg_offset;
3334 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3335 reg_offset);
3336
3337 emit_before(block, inst, SCRATCH_READ(temp, index));
3338 }
3339
3340 /**
3341 * Emits an instruction after @inst to store the value to be written
3342 * to @orig_dst to scratch space at @base_offset, from @temp.
3343 *
3344 * @base_offset is measured in 32-byte units (the size of a register).
3345 */
3346 void
3347 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3348 int base_offset)
3349 {
3350 int reg_offset = base_offset + inst->dst.reg_offset;
3351 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3352 reg_offset);
3353
3354 /* Create a temporary register to store *inst's result in.
3355 *
3356 * We have to be careful in MOVing from our temporary result register in
3357 * the scratch write. If we swizzle from channels of the temporary that
3358 * weren't initialized, it will confuse live interval analysis, which will
3359 * make spilling fail to make progress.
3360 */
3361 src_reg temp = src_reg(this, glsl_type::vec4_type);
3362 temp.type = inst->dst.type;
3363 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3364 int swizzles[4];
3365 for (int i = 0; i < 4; i++)
3366 if (inst->dst.writemask & (1 << i))
3367 swizzles[i] = i;
3368 else
3369 swizzles[i] = first_writemask_chan;
3370 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3371 swizzles[2], swizzles[3]);
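   /* For illustration: if the instruction only writes .yz, the temporary is
    * read back with swizzle (Y, Y, Z, Y), so the uninitialized .x and .w
    * channels are never sourced.
    */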
3372
3373 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3374 inst->dst.writemask));
3375 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3376 write->predicate = inst->predicate;
3377 write->ir = inst->ir;
3378 write->annotation = inst->annotation;
3379 inst->insert_after(block, write);
3380
3381 inst->dst.file = temp.file;
3382 inst->dst.reg = temp.reg;
3383 inst->dst.reg_offset = temp.reg_offset;
3384 inst->dst.reladdr = NULL;
3385 }
3386
3387 /**
3388 * We can't generally support array access in GRF space, because a
3389 * single instruction's destination can only span 2 contiguous
3390 * registers. So, we send all GRF arrays that get variable index
3391 * access to scratch space.
3392 */
3393 void
3394 vec4_visitor::move_grf_array_access_to_scratch()
3395 {
3396 int scratch_loc[this->alloc.count];
3397 memset(scratch_loc, -1, sizeof(scratch_loc));
3398
3399 /* First, calculate the set of virtual GRFs that need to be punted
3400 * to scratch due to having any array access on them, and where in
3401 * scratch.
3402 */
3403 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3404 if (inst->dst.file == GRF && inst->dst.reladdr &&
3405 scratch_loc[inst->dst.reg] == -1) {
3406 scratch_loc[inst->dst.reg] = c->last_scratch;
3407 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3408 }
3409
3410 for (int i = 0 ; i < 3; i++) {
3411 src_reg *src = &inst->src[i];
3412
3413 if (src->file == GRF && src->reladdr &&
3414 scratch_loc[src->reg] == -1) {
3415 scratch_loc[src->reg] = c->last_scratch;
3416 c->last_scratch += this->alloc.sizes[src->reg];
3417 }
3418 }
3419 }
3420
3421 /* Now, for anything that will be accessed through scratch, rewrite
3422 * it to load/store. Note that this is a _safe list walk, because
3423 * we may generate a new scratch_write instruction after the one
3424 * we're processing.
3425 */
3426 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3427 /* Set up the annotation tracking for new generated instructions. */
3428 base_ir = inst->ir;
3429 current_annotation = inst->annotation;
3430
3431 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3432 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3433 }
3434
3435 for (int i = 0 ; i < 3; i++) {
3436 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3437 continue;
3438
3439 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3440
3441 emit_scratch_read(block, inst, temp, inst->src[i],
3442 scratch_loc[inst->src[i].reg]);
3443
3444 inst->src[i].file = temp.file;
3445 inst->src[i].reg = temp.reg;
3446 inst->src[i].reg_offset = temp.reg_offset;
3447 inst->src[i].reladdr = NULL;
3448 }
3449 }
3450 }
3451
3452 /**
3453 * Emits an instruction before @inst to load the value named by @orig_src
3454 * from the pull constant buffer (surface) at @base_offset to @temp.
3455 */
3456 void
3457 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3458 dst_reg temp, src_reg orig_src,
3459 int base_offset)
3460 {
3461 int reg_offset = base_offset + orig_src.reg_offset;
3462 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3463 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3464 reg_offset);
3465 vec4_instruction *load;
3466
3467 if (brw->gen >= 7) {
3468 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3469 grf_offset.type = offset.type;
3470 emit_before(block, inst, MOV(grf_offset, offset));
3471
3472 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3473 temp, index, src_reg(grf_offset));
3474 load->mlen = 1;
3475 } else {
3476 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
3477 temp, index, offset);
3478 load->base_mrf = 14;
3479 load->mlen = 1;
3480 }
3481 emit_before(block, inst, load);
3482 }
3483
3484 /**
3485 * Implements array access of uniforms by inserting a
3486 * PULL_CONSTANT_LOAD instruction.
3487 *
3488 * Unlike temporary GRF array access (where we don't support it due to
3489 * the difficulty of doing relative addressing on instruction
3490 * destinations), we could potentially do array access of uniforms
3491 * that were loaded in GRF space as push constants. In real-world
3492 * usage we've seen, though, the arrays being used are always larger
3493 * than we could load as push constants, so just always move all
3494 * uniform array access out to a pull constant buffer.
3495 */
3496 void
3497 vec4_visitor::move_uniform_array_access_to_pull_constants()
3498 {
3499 int pull_constant_loc[this->uniforms];
3500 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3501 bool nested_reladdr;
3502
3503 /* Walk through and find array access of uniforms. Put a copy of that
3504 * uniform in the pull constant buffer.
3505 *
3506 * Note that we don't move constant-indexed accesses to arrays. No
3507 * testing has been done of the performance impact of this choice.
3508 */
3509 do {
3510 nested_reladdr = false;
3511
3512 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3513 for (int i = 0 ; i < 3; i++) {
3514 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3515 continue;
3516
3517 int uniform = inst->src[i].reg;
3518
3519 if (inst->src[i].reladdr->reladdr)
3520 nested_reladdr = true; /* will need another pass */
3521
3522 /* If this array isn't already present in the pull constant buffer,
3523 * add it.
3524 */
3525 if (pull_constant_loc[uniform] == -1) {
3526 const gl_constant_value **values =
3527 &stage_prog_data->param[uniform * 4];
3528
3529 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3530
3531 assert(uniform < uniform_array_size);
3532 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3533 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3534 = values[j];
3535 }
3536 }
3537
3538 /* Set up the annotation tracking for new generated instructions. */
3539 base_ir = inst->ir;
3540 current_annotation = inst->annotation;
3541
3542 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3543
3544 emit_pull_constant_load(block, inst, temp, inst->src[i],
3545 pull_constant_loc[uniform]);
3546
3547 inst->src[i].file = temp.file;
3548 inst->src[i].reg = temp.reg;
3549 inst->src[i].reg_offset = temp.reg_offset;
3550 inst->src[i].reladdr = NULL;
3551 }
3552 }
3553 } while (nested_reladdr);
3554
3555 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3556 * no need to track them as larger-than-vec4 objects. This will be
3557 * relied on in cutting out unused uniform vectors from push
3558 * constants.
3559 */
3560 split_uniform_registers();
3561 }
3562
3563 void
3564 vec4_visitor::resolve_ud_negate(src_reg *reg)
3565 {
3566 if (reg->type != BRW_REGISTER_TYPE_UD ||
3567 !reg->negate)
3568 return;
3569
3570 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3571 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3572 *reg = temp;
3573 }
3574
3575 /**
3576 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3577 *
3578 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3579 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3580 */
3581 void
3582 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3583 {
3584 assert(brw->gen <= 5);
3585
3586 if (!rvalue->type->is_boolean())
3587 return;
3588
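   /* Illustrative: a Gen4-5 CMP result of 0x00000001 is ANDed down to 1 and
    * then negated to -1 (0xffffffff), i.e. the canonical ~0 "true".
    */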
3589 src_reg and_result = src_reg(this, rvalue->type);
3590 src_reg neg_result = src_reg(this, rvalue->type);
3591 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3592 emit(MOV(dst_reg(neg_result), negate(and_result)));
3593 *reg = neg_result;
3594 }
3595
3596 vec4_visitor::vec4_visitor(struct brw_context *brw,
3597 struct brw_vec4_compile *c,
3598 struct gl_program *prog,
3599 const struct brw_vue_prog_key *key,
3600 struct brw_vue_prog_data *prog_data,
3601 struct gl_shader_program *shader_prog,
3602 gl_shader_stage stage,
3603 void *mem_ctx,
3604 bool no_spills,
3605 shader_time_shader_type st_base,
3606 shader_time_shader_type st_written,
3607 shader_time_shader_type st_reset)
3608 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3609 c(c),
3610 key(key),
3611 prog_data(prog_data),
3612 sanity_param_count(0),
3613 fail_msg(NULL),
3614 first_non_payload_grf(0),
3615 need_all_constants_in_pull_buffer(false),
3616 no_spills(no_spills),
3617 st_base(st_base),
3618 st_written(st_written),
3619 st_reset(st_reset)
3620 {
3621 this->mem_ctx = mem_ctx;
3622 this->failed = false;
3623
3624 this->base_ir = NULL;
3625 this->current_annotation = NULL;
3626 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3627
3628 this->variable_ht = hash_table_ctor(0,
3629 hash_table_pointer_hash,
3630 hash_table_pointer_compare);
3631
3632 this->virtual_grf_start = NULL;
3633 this->virtual_grf_end = NULL;
3634 this->live_intervals = NULL;
3635
3636 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3637
3638 this->uniforms = 0;
3639
3640 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3641 * at least one. See setup_uniforms() in brw_vec4.cpp.
3642 */
3643 this->uniform_array_size = 1;
3644 if (prog_data) {
3645 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3646 }
3647
3648 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3649 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3650 }
3651
3652 vec4_visitor::~vec4_visitor()
3653 {
3654 hash_table_dtor(this->variable_ht);
3655 }
3656
3657
3658 void
3659 vec4_visitor::fail(const char *format, ...)
3660 {
3661 va_list va;
3662 char *msg;
3663
3664 if (failed)
3665 return;
3666
3667 failed = true;
3668
3669 va_start(va, format);
3670 msg = ralloc_vasprintf(mem_ctx, format, va);
3671 va_end(va);
3672 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3673
3674 this->fail_msg = msg;
3675
3676 if (debug_enabled) {
3677 fprintf(stderr, "%s", msg);
3678 }
3679 }
3680
3681 } /* namespace brw */