i965: Handle scratch accesses where reladdr also points to scratch space
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
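/* The ALU1/ALU2/ALU3 macros below stamp out one builder method per opcode.
 * Each builder only allocates a vec4_instruction; it does not add it to the
 * instruction stream, so callers wrap the result in emit(), e.g.
 * emit(ADD(dst, a, b)). ALU2_ACC additionally marks the instruction as
 * writing the accumulator, and ALU3 asserts gen >= 6, where three-source
 * instructions first appear.
 */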
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(brw->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
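/* Builders for gen4-style scratch space messages. They only record the
 * opcode, the base MRF and the message length; the actual payload (header,
 * offset and, for writes, the data) is assembled into those MRFs later by
 * the generator.
 */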
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
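/* Emit a dot product of the requested width: elements must be 2, 3 or 4,
 * selecting DP2, DP3 or DP4 respectively.
 */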
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (brw->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
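/* Emit a math instruction, working around per-generation restrictions:
 * operands are legalized with fix_math_operand(), gen6 MATH cannot honor a
 * partial writemask (so we compute into a temporary and MOV the result),
 * and pre-gen6 math is a send that needs a base MRF and message length.
 */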
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (brw->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (brw->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * The upper word of each write-channel must be 0 for the following
417 * bit-shift and bit-or instructions to work. Note that this relies on
418 * the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
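/* packUnorm4x8: clamp each component to [0, 1] with a saturating MOV,
 * scale by 255, round to nearest even, convert to unsigned int, then pack
 * the low byte of each component into the destination dword.
 */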
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
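/* Return the size of a GLSL type in units of vec4 registers. Every scalar
 * or vector occupies a full vec4 slot, matrices take one slot per column,
 * and opaque types (samplers, atomic counters) take no storage at all.
 */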
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_DOUBLE:
616 case GLSL_TYPE_ERROR:
617 case GLSL_TYPE_INTERFACE:
618 unreachable("not reached");
619 }
620
621 return 0;
622 }
623
624 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
625 {
626 init();
627
628 this->file = GRF;
629 this->reg = v->alloc.allocate(type_size(type));
630
631 if (type->is_array() || type->is_record()) {
632 this->swizzle = BRW_SWIZZLE_NOOP;
633 } else {
634 this->swizzle = brw_swizzle_for_size(type->vector_elements);
635 }
636
637 this->type = brw_type_for_base_type(type);
638 }
639
640 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
641 {
642 assert(size > 0);
643
644 init();
645
646 this->file = GRF;
647 this->reg = v->alloc.allocate(type_size(type) * size);
648
649 this->swizzle = BRW_SWIZZLE_NOOP;
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->writemask = WRITEMASK_XYZW;
663 } else {
664 this->writemask = (1 << type->vector_elements) - 1;
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 /* Our support for uniforms is piggy-backed on the struct
671 * gl_fragment_program, because that's where the values actually
672 * get stored, rather than in some global gl_shader_program uniform
673 * store.
674 */
675 void
676 vec4_visitor::setup_uniform_values(ir_variable *ir)
677 {
678 int namelen = strlen(ir->name);
679
680 /* The data for our (non-builtin) uniforms is stored in a series of
681 * gl_uniform_storage structs for each subcomponent that
682 * glGetUniformLocation() could name. We know it's been set up in the same
683 * order we'd walk the type, so walk the list of storage and find anything
684 * with our name, or the prefix of a component that starts with our name.
685 */
686 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
687 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
688
689 if (strncmp(ir->name, storage->name, namelen) != 0 ||
690 (storage->name[namelen] != 0 &&
691 storage->name[namelen] != '.' &&
692 storage->name[namelen] != '[')) {
693 continue;
694 }
695
696 gl_constant_value *components = storage->storage;
697 unsigned vector_count = (MAX2(storage->array_elements, 1) *
698 storage->type->matrix_columns);
699
700 for (unsigned s = 0; s < vector_count; s++) {
701 assert(uniforms < uniform_array_size);
702 uniform_vector_size[uniforms] = storage->type->vector_elements;
703
704 int i;
705 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
706 stage_prog_data->param[uniforms * 4 + i] = components;
707 components++;
708 }
709 for (; i < 4; i++) {
710 static gl_constant_value zero = { 0.0 };
711 stage_prog_data->param[uniforms * 4 + i] = &zero;
712 }
713
714 uniforms++;
715 }
716 }
717 }
718
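/* Upload the active user clip planes as push constants (one vec4 each) and
 * record where they landed in userplane[] so the clip distance code can
 * reference them later.
 */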
719 void
720 vec4_visitor::setup_uniform_clipplane_values()
721 {
722 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
723
724 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 4;
727 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
728 this->userplane[i].type = BRW_REGISTER_TYPE_F;
729 for (int j = 0; j < 4; ++j) {
730 stage_prog_data->param[this->uniforms * 4 + j] =
731 (gl_constant_value *) &clip_planes[i][j];
732 }
733 ++this->uniforms;
734 }
735 }
736
737 /* Our support for builtin uniforms is even scarier than non-builtin.
738 * It sits on top of the PROG_STATE_VAR parameters that are
739 * automatically updated from GL context state.
740 */
741 void
742 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
743 {
744 const ir_state_slot *const slots = ir->get_state_slots();
745 assert(slots != NULL);
746
747 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
748 /* This state reference has already been set up by ir_to_mesa,
749 * but we'll get the same index back here. We can reference
750 * ParameterValues directly, since unlike brw_fs.cpp, we never
751 * add new state references during compile.
752 */
753 int index = _mesa_add_state_reference(this->prog->Parameters,
754 (gl_state_index *)slots[i].tokens);
755 gl_constant_value *values =
756 &this->prog->Parameters->ParameterValues[index][0];
757
758 assert(this->uniforms < uniform_array_size);
759
760 for (unsigned j = 0; j < 4; j++)
761 stage_prog_data->param[this->uniforms * 4 + j] =
762 &values[GET_SWZ(slots[i].swizzle, j)];
763
764 this->uniform_vector_size[this->uniforms] =
765 (ir->type->is_scalar() || ir->type->is_vector() ||
766 ir->type->is_matrix() ? ir->type->vector_elements : 4);
767
768 this->uniforms++;
769 }
770 }
771
772 dst_reg *
773 vec4_visitor::variable_storage(ir_variable *var)
774 {
775 return (dst_reg *)hash_table_find(this->variable_ht, var);
776 }
777
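/* Evaluate a boolean rvalue and leave its result in the flag register,
 * returning through *predicate the predicate the caller should put on the
 * instruction that consumes it. Comparisons and logic ops are folded
 * directly into conditional-mod instructions where possible; anything else
 * is ANDed with 1 and tested against zero.
 */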
778 void
779 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
780 enum brw_predicate *predicate)
781 {
782 ir_expression *expr = ir->as_expression();
783
784 *predicate = BRW_PREDICATE_NORMAL;
785
786 if (expr && expr->operation != ir_binop_ubo_load) {
787 src_reg op[3];
788 vec4_instruction *inst;
789
790 assert(expr->get_num_operands() <= 3);
791 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
792 expr->operands[i]->accept(this);
793 op[i] = this->result;
794
795 resolve_ud_negate(&op[i]);
796 }
797
798 switch (expr->operation) {
799 case ir_unop_logic_not:
800 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
801 inst->conditional_mod = BRW_CONDITIONAL_Z;
802 break;
803
804 case ir_binop_logic_xor:
805 if (brw->gen <= 5) {
806 src_reg temp = src_reg(this, ir->type);
807 emit(XOR(dst_reg(temp), op[0], op[1]));
808 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
809 } else {
810 inst = emit(XOR(dst_null_d(), op[0], op[1]));
811 }
812 inst->conditional_mod = BRW_CONDITIONAL_NZ;
813 break;
814
815 case ir_binop_logic_or:
816 if (brw->gen <= 5) {
817 src_reg temp = src_reg(this, ir->type);
818 emit(OR(dst_reg(temp), op[0], op[1]));
819 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
820 } else {
821 inst = emit(OR(dst_null_d(), op[0], op[1]));
822 }
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 break;
825
826 case ir_binop_logic_and:
827 if (brw->gen <= 5) {
828 src_reg temp = src_reg(this, ir->type);
829 emit(AND(dst_reg(temp), op[0], op[1]));
830 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
831 } else {
832 inst = emit(AND(dst_null_d(), op[0], op[1]));
833 }
834 inst->conditional_mod = BRW_CONDITIONAL_NZ;
835 break;
836
837 case ir_unop_f2b:
838 if (brw->gen >= 6) {
839 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
840 } else {
841 inst = emit(MOV(dst_null_f(), op[0]));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 break;
845
846 case ir_unop_i2b:
847 if (brw->gen >= 6) {
848 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
849 } else {
850 inst = emit(MOV(dst_null_d(), op[0]));
851 inst->conditional_mod = BRW_CONDITIONAL_NZ;
852 }
853 break;
854
855 case ir_binop_all_equal:
856 if (brw->gen <= 5) {
857 resolve_bool_comparison(expr->operands[0], &op[0]);
858 resolve_bool_comparison(expr->operands[1], &op[1]);
859 }
860 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
861 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
862 break;
863
864 case ir_binop_any_nequal:
865 if (brw->gen <= 5) {
866 resolve_bool_comparison(expr->operands[0], &op[0]);
867 resolve_bool_comparison(expr->operands[1], &op[1]);
868 }
869 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
870 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
871 break;
872
873 case ir_unop_any:
874 if (brw->gen <= 5) {
875 resolve_bool_comparison(expr->operands[0], &op[0]);
876 }
877 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
878 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
879 break;
880
881 case ir_binop_greater:
882 case ir_binop_gequal:
883 case ir_binop_less:
884 case ir_binop_lequal:
885 case ir_binop_equal:
886 case ir_binop_nequal:
887 if (brw->gen <= 5) {
888 resolve_bool_comparison(expr->operands[0], &op[0]);
889 resolve_bool_comparison(expr->operands[1], &op[1]);
890 }
891 emit(CMP(dst_null_d(), op[0], op[1],
892 brw_conditional_for_comparison(expr->operation)));
893 break;
894
895 case ir_triop_csel: {
896 /* Expand the boolean condition into the flag register. */
897 inst = emit(MOV(dst_null_d(), op[0]));
898 inst->conditional_mod = BRW_CONDITIONAL_NZ;
899
900 /* Select which boolean to return. */
901 dst_reg temp(this, expr->operands[1]->type);
902 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
903 inst->predicate = BRW_PREDICATE_NORMAL;
904
905 /* Expand the result to a condition code. */
906 inst = emit(MOV(dst_null_d(), src_reg(temp)));
907 inst->conditional_mod = BRW_CONDITIONAL_NZ;
908 break;
909 }
910
911 default:
912 unreachable("not reached");
913 }
914 return;
915 }
916
917 ir->accept(this);
918
919 resolve_ud_negate(&this->result);
920
921 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
922 inst->conditional_mod = BRW_CONDITIONAL_NZ;
923 }
924
925 /**
926 * Emit a gen6 IF statement with the comparison folded into the IF
927 * instruction.
928 */
929 void
930 vec4_visitor::emit_if_gen6(ir_if *ir)
931 {
932 ir_expression *expr = ir->condition->as_expression();
933
934 if (expr && expr->operation != ir_binop_ubo_load) {
935 src_reg op[3];
936 dst_reg temp;
937
938 assert(expr->get_num_operands() <= 3);
939 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
940 expr->operands[i]->accept(this);
941 op[i] = this->result;
942 }
943
944 switch (expr->operation) {
945 case ir_unop_logic_not:
946 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
947 return;
948
949 case ir_binop_logic_xor:
950 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
951 return;
952
953 case ir_binop_logic_or:
954 temp = dst_reg(this, glsl_type::bool_type);
955 emit(OR(temp, op[0], op[1]));
956 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
957 return;
958
959 case ir_binop_logic_and:
960 temp = dst_reg(this, glsl_type::bool_type);
961 emit(AND(temp, op[0], op[1]));
962 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
963 return;
964
965 case ir_unop_f2b:
966 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_i2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_binop_greater:
974 case ir_binop_gequal:
975 case ir_binop_less:
976 case ir_binop_lequal:
977 case ir_binop_equal:
978 case ir_binop_nequal:
979 emit(IF(op[0], op[1],
980 brw_conditional_for_comparison(expr->operation)));
981 return;
982
983 case ir_binop_all_equal:
984 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
985 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
986 return;
987
988 case ir_binop_any_nequal:
989 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
990 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
991 return;
992
993 case ir_unop_any:
994 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
995 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
996 return;
997
998 case ir_triop_csel: {
999 /* Expand the boolean condition into the flag register. */
1000 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1001 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1002
1003 /* Select which boolean to return. */
1004 dst_reg temp(this, expr->operands[1]->type);
1005 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1006 inst->predicate = BRW_PREDICATE_NORMAL;
1007
1008 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1009 return;
1010 }
1011
1012 default:
1013 unreachable("not reached");
1014 }
1015 return;
1016 }
1017
1018 ir->condition->accept(this);
1019
1020 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1021 }
1022
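/* Allocate backing storage for a variable the first time it is visited and
 * record it in variable_ht. Shader inputs map to the ATTR file, outputs and
 * temporaries get fresh GRFs, and uniforms get UNIFORM-file registers whose
 * values are gathered into stage_prog_data->param[].
 */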
1023 void
1024 vec4_visitor::visit(ir_variable *ir)
1025 {
1026 dst_reg *reg = NULL;
1027
1028 if (variable_storage(ir))
1029 return;
1030
1031 switch (ir->data.mode) {
1032 case ir_var_shader_in:
1033 assert(ir->data.location != -1);
1034 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1035 break;
1036
1037 case ir_var_shader_out:
1038 assert(ir->data.location != -1);
1039 reg = new(mem_ctx) dst_reg(this, ir->type);
1040
1041 for (int i = 0; i < type_size(ir->type); i++) {
1042 output_reg[ir->data.location + i] = *reg;
1043 output_reg[ir->data.location + i].reg_offset = i;
1044 output_reg[ir->data.location + i].type =
1045 brw_type_for_base_type(ir->type->get_scalar_type());
1046 output_reg_annotation[ir->data.location + i] = ir->name;
1047 }
1048 break;
1049
1050 case ir_var_auto:
1051 case ir_var_temporary:
1052 reg = new(mem_ctx) dst_reg(this, ir->type);
1053 break;
1054
1055 case ir_var_uniform:
1056 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1057
1058 /* Thanks to the lower_ubo_reference pass, we will see only
1059 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1060 * variables, so no need for them to be in variable_ht.
1061 *
1062 * Some uniforms, such as samplers and atomic counters, have no actual
1063 * storage, so we should ignore them.
1064 */
1065 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1066 return;
1067
1068 /* Track how big the whole uniform variable is, in case we need to put a
1069 * copy of its data into pull constants for array access.
1070 */
1071 assert(this->uniforms < uniform_array_size);
1072 this->uniform_size[this->uniforms] = type_size(ir->type);
1073
1074 if (!strncmp(ir->name, "gl_", 3)) {
1075 setup_builtin_uniform_values(ir);
1076 } else {
1077 setup_uniform_values(ir);
1078 }
1079 break;
1080
1081 case ir_var_system_value:
1082 reg = make_reg_for_system_value(ir);
1083 break;
1084
1085 default:
1086 unreachable("not reached");
1087 }
1088
1089 reg->type = brw_type_for_base_type(ir->type);
1090 hash_table_insert(this->variable_ht, reg, ir);
1091 }
1092
1093 void
1094 vec4_visitor::visit(ir_loop *ir)
1095 {
1096 /* We don't want debugging output to print the whole body of the
1097 * loop as the annotation.
1098 */
1099 this->base_ir = NULL;
1100
1101 emit(BRW_OPCODE_DO);
1102
1103 visit_instructions(&ir->body_instructions);
1104
1105 emit(BRW_OPCODE_WHILE);
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_loop_jump *ir)
1110 {
1111 switch (ir->mode) {
1112 case ir_loop_jump::jump_break:
1113 emit(BRW_OPCODE_BREAK);
1114 break;
1115 case ir_loop_jump::jump_continue:
1116 emit(BRW_OPCODE_CONTINUE);
1117 break;
1118 }
1119 }
1120
1121
1122 void
1123 vec4_visitor::visit(ir_function_signature *)
1124 {
1125 unreachable("not reached");
1126 }
1127
1128 void
1129 vec4_visitor::visit(ir_function *ir)
1130 {
1131 /* Ignore function bodies other than main() -- we shouldn't see calls to
1132 * them since they should all be inlined.
1133 */
1134 if (strcmp(ir->name, "main") == 0) {
1135 const ir_function_signature *sig;
1136 exec_list empty;
1137
1138 sig = ir->matching_signature(NULL, &empty, false);
1139
1140 assert(sig);
1141
1142 visit_instructions(&sig->body);
1143 }
1144 }
1145
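/* Try to fuse an add with a multiply operand (possibly under a negate or
 * abs) into a single MAD. Only attempted for float expressions on gen6+,
 * where three-source instructions exist. Returns true if the MAD was
 * emitted, false if the caller should fall back to a plain ADD.
 */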
1146 bool
1147 vec4_visitor::try_emit_mad(ir_expression *ir)
1148 {
1149 /* 3-src instructions were introduced in gen6. */
1150 if (brw->gen < 6)
1151 return false;
1152
1153 /* MAD can only handle floating-point data. */
1154 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1155 return false;
1156
1157 ir_rvalue *nonmul;
1158 ir_expression *mul;
1159 bool mul_negate, mul_abs;
1160
1161 for (int i = 0; i < 2; i++) {
1162 mul_negate = false;
1163 mul_abs = false;
1164
1165 mul = ir->operands[i]->as_expression();
1166 nonmul = ir->operands[1 - i];
1167
1168 if (mul && mul->operation == ir_unop_abs) {
1169 mul = mul->operands[0]->as_expression();
1170 mul_abs = true;
1171 } else if (mul && mul->operation == ir_unop_neg) {
1172 mul = mul->operands[0]->as_expression();
1173 mul_negate = true;
1174 }
1175
1176 if (mul && mul->operation == ir_binop_mul)
1177 break;
1178 }
1179
1180 if (!mul || mul->operation != ir_binop_mul)
1181 return false;
1182
1183 nonmul->accept(this);
1184 src_reg src0 = fix_3src_operand(this->result);
1185
1186 mul->operands[0]->accept(this);
1187 src_reg src1 = fix_3src_operand(this->result);
1188 src1.negate ^= mul_negate;
1189 src1.abs = mul_abs;
1190 if (mul_abs)
1191 src1.negate = false;
1192
1193 mul->operands[1]->accept(this);
1194 src_reg src2 = fix_3src_operand(this->result);
1195 src2.abs = mul_abs;
1196 if (mul_abs)
1197 src2.negate = false;
1198
1199 this->result = src_reg(this, ir->type);
1200 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1201
1202 return true;
1203 }
1204
1205 bool
1206 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1207 {
1208 /* This optimization relies on CMP setting the destination to 0 when
1209 * false. Early hardware only sets the least significant bit, and
1210 * leaves the other bits undefined. So we can't use it.
1211 */
1212 if (brw->gen < 6)
1213 return false;
1214
1215 ir_expression *const cmp = ir->operands[0]->as_expression();
1216
1217 if (cmp == NULL)
1218 return false;
1219
1220 switch (cmp->operation) {
1221 case ir_binop_less:
1222 case ir_binop_greater:
1223 case ir_binop_lequal:
1224 case ir_binop_gequal:
1225 case ir_binop_equal:
1226 case ir_binop_nequal:
1227 break;
1228
1229 default:
1230 return false;
1231 }
1232
1233 cmp->operands[0]->accept(this);
1234 const src_reg cmp_src0 = this->result;
1235
1236 cmp->operands[1]->accept(this);
1237 const src_reg cmp_src1 = this->result;
1238
1239 this->result = src_reg(this, ir->type);
1240
1241 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1242 brw_conditional_for_comparison(cmp->operation)));
1243
1244 /* If the comparison is false, this->result will just happen to be zero.
1245 */
1246 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1247 this->result, src_reg(1.0f));
1248 inst->predicate = BRW_PREDICATE_NORMAL;
1249 inst->predicate_inverse = true;
1250
1251 return true;
1252 }
1253
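/* Emit a min/max: emit_minmax(BRW_CONDITIONAL_L, dst, a, b) computes
 * min(a, b), and BRW_CONDITIONAL_GE computes max(a, b). On gen6+ this is a
 * single SEL with a conditional mod; earlier generations need an explicit
 * CMP followed by a predicated SEL.
 */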
1254 void
1255 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1256 src_reg src0, src_reg src1)
1257 {
1258 vec4_instruction *inst;
1259
1260 if (brw->gen >= 6) {
1261 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1262 inst->conditional_mod = conditionalmod;
1263 } else {
1264 emit(CMP(dst, src0, src1, conditionalmod));
1265
1266 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1267 inst->predicate = BRW_PREDICATE_NORMAL;
1268 }
1269 }
1270
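/* Emit linear interpolation, x*(1-a) + y*a. Gen6+ has a native LRP
 * instruction (note its reversed argument order); earlier generations
 * expand it into a MUL/ADD/MUL/ADD sequence.
 */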
1271 void
1272 vec4_visitor::emit_lrp(const dst_reg &dst,
1273 const src_reg &x, const src_reg &y, const src_reg &a)
1274 {
1275 if (brw->gen >= 6) {
1276 /* Note that the instruction's argument order is reversed from GLSL
1277 * and the IR.
1278 */
1279 emit(LRP(dst,
1280 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1281 } else {
1282 /* Earlier generations don't support three source operations, so we
1283 * need to emit x*(1-a) + y*a.
1284 */
1285 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1286 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1287 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1288 y_times_a.writemask = dst.writemask;
1289 one_minus_a.writemask = dst.writemask;
1290 x_times_one_minus_a.writemask = dst.writemask;
1291
1292 emit(MUL(y_times_a, y, a));
1293 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1294 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1295 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1296 }
1297 }
1298
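/* Main expression visitor: evaluate each operand into a src_reg, then
 * dispatch on the IR operation to emit the corresponding vec4
 * instruction(s), leaving the value in this->result.
 */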
1299 void
1300 vec4_visitor::visit(ir_expression *ir)
1301 {
1302 unsigned int operand;
1303 src_reg op[ARRAY_SIZE(ir->operands)];
1304 vec4_instruction *inst;
1305
1306 if (ir->operation == ir_binop_add) {
1307 if (try_emit_mad(ir))
1308 return;
1309 }
1310
1311 if (ir->operation == ir_unop_b2f) {
1312 if (try_emit_b2f_of_compare(ir))
1313 return;
1314 }
1315
1316 /* Storage for our result. Ideally for an assignment we'd be using
1317 * the actual storage for the result here, instead.
1318 */
1319 dst_reg result_dst(this, ir->type);
1320 src_reg result_src(result_dst);
1321
1322 if (ir->operation == ir_triop_csel) {
1323 ir->operands[1]->accept(this);
1324 op[1] = this->result;
1325 ir->operands[2]->accept(this);
1326 op[2] = this->result;
1327
1328 enum brw_predicate predicate;
1329 emit_bool_to_cond_code(ir->operands[0], &predicate);
1330 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1331 inst->predicate = predicate;
1332 this->result = result_src;
1333 return;
1334 }
1335
1336 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1337 this->result.file = BAD_FILE;
1338 ir->operands[operand]->accept(this);
1339 if (this->result.file == BAD_FILE) {
1340 fprintf(stderr, "Failed to get tree for expression operand:\n");
1341 ir->operands[operand]->fprint(stderr);
1342 exit(1);
1343 }
1344 op[operand] = this->result;
1345
1346 /* Matrix expression operands should have been broken down to vector
1347 * operations already.
1348 */
1349 assert(!ir->operands[operand]->type->is_matrix());
1350 }
1351
1352 /* If nothing special happens, this is the result. */
1353 this->result = result_src;
1354
1355 switch (ir->operation) {
1356 case ir_unop_logic_not:
1357 emit(NOT(result_dst, op[0]));
1358 break;
1359 case ir_unop_neg:
1360 op[0].negate = !op[0].negate;
1361 emit(MOV(result_dst, op[0]));
1362 break;
1363 case ir_unop_abs:
1364 op[0].abs = true;
1365 op[0].negate = false;
1366 emit(MOV(result_dst, op[0]));
1367 break;
1368
1369 case ir_unop_sign:
1370 if (ir->type->is_float()) {
1371 /* AND(val, 0x80000000) gives the sign bit.
1372 *
1373 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1374 * zero.
1375 */
1376 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1377
1378 op[0].type = BRW_REGISTER_TYPE_UD;
1379 result_dst.type = BRW_REGISTER_TYPE_UD;
1380 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1381
1382 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1383 inst->predicate = BRW_PREDICATE_NORMAL;
1384
1385 this->result.type = BRW_REGISTER_TYPE_F;
1386 } else {
1387 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1388 * -> non-negative val generates 0x00000000.
1389 * Predicated OR sets 1 if val is positive.
1390 */
1391 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1392
1393 emit(ASR(result_dst, op[0], src_reg(31)));
1394
1395 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1396 inst->predicate = BRW_PREDICATE_NORMAL;
1397 }
1398 break;
1399
1400 case ir_unop_rcp:
1401 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1402 break;
1403
1404 case ir_unop_exp2:
1405 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1406 break;
1407 case ir_unop_log2:
1408 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1409 break;
1410 case ir_unop_exp:
1411 case ir_unop_log:
1412 unreachable("not reached: should be handled by ir_explog_to_explog2");
1413 case ir_unop_sin:
1414 case ir_unop_sin_reduced:
1415 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1416 break;
1417 case ir_unop_cos:
1418 case ir_unop_cos_reduced:
1419 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1420 break;
1421
1422 case ir_unop_dFdx:
1423 case ir_unop_dFdx_coarse:
1424 case ir_unop_dFdx_fine:
1425 case ir_unop_dFdy:
1426 case ir_unop_dFdy_coarse:
1427 case ir_unop_dFdy_fine:
1428 unreachable("derivatives not valid in vertex shader");
1429
1430 case ir_unop_bitfield_reverse:
1431 emit(BFREV(result_dst, op[0]));
1432 break;
1433 case ir_unop_bit_count:
1434 emit(CBIT(result_dst, op[0]));
1435 break;
1436 case ir_unop_find_msb: {
1437 src_reg temp = src_reg(this, glsl_type::uint_type);
1438
1439 inst = emit(FBH(dst_reg(temp), op[0]));
1440 inst->dst.writemask = WRITEMASK_XYZW;
1441
1442 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1443 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1444 * subtract the result from 31 to convert the MSB count into an LSB count.
1445 */
1446
1447 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1448 temp.swizzle = BRW_SWIZZLE_NOOP;
1449 emit(MOV(result_dst, temp));
1450
1451 src_reg src_tmp = src_reg(result_dst);
1452 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1453
1454 src_tmp.negate = true;
1455 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1456 inst->predicate = BRW_PREDICATE_NORMAL;
1457 break;
1458 }
1459 case ir_unop_find_lsb:
1460 emit(FBL(result_dst, op[0]));
1461 break;
1462 case ir_unop_saturate:
1463 inst = emit(MOV(result_dst, op[0]));
1464 inst->saturate = true;
1465 break;
1466
1467 case ir_unop_noise:
1468 unreachable("not reached: should be handled by lower_noise");
1469
1470 case ir_binop_add:
1471 emit(ADD(result_dst, op[0], op[1]));
1472 break;
1473 case ir_binop_sub:
1474 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1475
1476 case ir_binop_mul:
1477 if (brw->gen < 8 && ir->type->is_integer()) {
1478 /* For integer multiplication, the MUL uses the low 16 bits of one of
1479 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1480 * then accumulates the contribution of the upper 16 bits of that
1481 * operand. If we can determine that one of the args is in the low
1482 * 16 bits, though, we can just emit a single MUL.
1483 */
1484 if (ir->operands[0]->is_uint16_constant()) {
1485 if (brw->gen < 7)
1486 emit(MUL(result_dst, op[0], op[1]));
1487 else
1488 emit(MUL(result_dst, op[1], op[0]));
1489 } else if (ir->operands[1]->is_uint16_constant()) {
1490 if (brw->gen < 7)
1491 emit(MUL(result_dst, op[1], op[0]));
1492 else
1493 emit(MUL(result_dst, op[0], op[1]));
1494 } else {
1495 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1496
1497 emit(MUL(acc, op[0], op[1]));
1498 emit(MACH(dst_null_d(), op[0], op[1]));
1499 emit(MOV(result_dst, src_reg(acc)));
1500 }
1501 } else {
1502 emit(MUL(result_dst, op[0], op[1]));
1503 }
1504 break;
1505 case ir_binop_imul_high: {
1506 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1507
1508 emit(MUL(acc, op[0], op[1]));
1509 emit(MACH(result_dst, op[0], op[1]));
1510 break;
1511 }
1512 case ir_binop_div:
1513 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1514 assert(ir->type->is_integer());
1515 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1516 break;
1517 case ir_binop_carry: {
1518 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1519
1520 emit(ADDC(dst_null_ud(), op[0], op[1]));
1521 emit(MOV(result_dst, src_reg(acc)));
1522 break;
1523 }
1524 case ir_binop_borrow: {
1525 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1526
1527 emit(SUBB(dst_null_ud(), op[0], op[1]));
1528 emit(MOV(result_dst, src_reg(acc)));
1529 break;
1530 }
1531 case ir_binop_mod:
1532 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1533 assert(ir->type->is_integer());
1534 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1535 break;
1536
1537 case ir_binop_less:
1538 case ir_binop_greater:
1539 case ir_binop_lequal:
1540 case ir_binop_gequal:
1541 case ir_binop_equal:
1542 case ir_binop_nequal: {
1543 if (brw->gen <= 5) {
1544 resolve_bool_comparison(ir->operands[0], &op[0]);
1545 resolve_bool_comparison(ir->operands[1], &op[1]);
1546 }
1547 emit(CMP(result_dst, op[0], op[1],
1548 brw_conditional_for_comparison(ir->operation)));
1549 break;
1550 }
1551
1552 case ir_binop_all_equal:
1553 if (brw->gen <= 5) {
1554 resolve_bool_comparison(ir->operands[0], &op[0]);
1555 resolve_bool_comparison(ir->operands[1], &op[1]);
1556 }
1557
1558 /* "==" operator producing a scalar boolean. */
1559 if (ir->operands[0]->type->is_vector() ||
1560 ir->operands[1]->type->is_vector()) {
1561 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1562 emit(MOV(result_dst, src_reg(0)));
1563 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1564 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1565 } else {
1566 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1567 }
1568 break;
1569 case ir_binop_any_nequal:
1570 if (brw->gen <= 5) {
1571 resolve_bool_comparison(ir->operands[0], &op[0]);
1572 resolve_bool_comparison(ir->operands[1], &op[1]);
1573 }
1574
1575 /* "!=" operator producing a scalar boolean. */
1576 if (ir->operands[0]->type->is_vector() ||
1577 ir->operands[1]->type->is_vector()) {
1578 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1579
1580 emit(MOV(result_dst, src_reg(0)));
1581 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1582 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1583 } else {
1584 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1585 }
1586 break;
1587
1588 case ir_unop_any:
1589 if (brw->gen <= 5) {
1590 resolve_bool_comparison(ir->operands[0], &op[0]);
1591 }
1592 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1593 emit(MOV(result_dst, src_reg(0)));
1594
1595 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1596 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1597 break;
1598
1599 case ir_binop_logic_xor:
1600 emit(XOR(result_dst, op[0], op[1]));
1601 break;
1602
1603 case ir_binop_logic_or:
1604 emit(OR(result_dst, op[0], op[1]));
1605 break;
1606
1607 case ir_binop_logic_and:
1608 emit(AND(result_dst, op[0], op[1]));
1609 break;
1610
1611 case ir_binop_dot:
1612 assert(ir->operands[0]->type->is_vector());
1613 assert(ir->operands[0]->type == ir->operands[1]->type);
1614 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1615 break;
1616
1617 case ir_unop_sqrt:
1618 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1619 break;
1620 case ir_unop_rsq:
1621 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1622 break;
1623
1624 case ir_unop_bitcast_i2f:
1625 case ir_unop_bitcast_u2f:
1626 this->result = op[0];
1627 this->result.type = BRW_REGISTER_TYPE_F;
1628 break;
1629
1630 case ir_unop_bitcast_f2i:
1631 this->result = op[0];
1632 this->result.type = BRW_REGISTER_TYPE_D;
1633 break;
1634
1635 case ir_unop_bitcast_f2u:
1636 this->result = op[0];
1637 this->result.type = BRW_REGISTER_TYPE_UD;
1638 break;
1639
1640 case ir_unop_i2f:
1641 case ir_unop_i2u:
1642 case ir_unop_u2i:
1643 case ir_unop_u2f:
1644 case ir_unop_f2i:
1645 case ir_unop_f2u:
1646 emit(MOV(result_dst, op[0]));
1647 break;
1648 case ir_unop_b2i:
1649 emit(AND(result_dst, op[0], src_reg(1)));
1650 break;
1651 case ir_unop_b2f:
1652 if (brw->gen <= 5) {
1653 resolve_bool_comparison(ir->operands[0], &op[0]);
1654 }
1655 op[0].type = BRW_REGISTER_TYPE_D;
1656 result_dst.type = BRW_REGISTER_TYPE_D;
1657 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1658 result_dst.type = BRW_REGISTER_TYPE_F;
1659 break;
1660 case ir_unop_f2b:
1661 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1662 break;
1663 case ir_unop_i2b:
1664 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1665 break;
1666
1667 case ir_unop_trunc:
1668 emit(RNDZ(result_dst, op[0]));
1669 break;
1670 case ir_unop_ceil: {
1671 src_reg tmp = src_reg(this, ir->type);
1672 op[0].negate = !op[0].negate;
1673 emit(RNDD(dst_reg(tmp), op[0]));
1674 tmp.negate = true;
1675 emit(MOV(result_dst, tmp));
1676 }
1677 break;
1678 case ir_unop_floor:
1679 inst = emit(RNDD(result_dst, op[0]));
1680 break;
1681 case ir_unop_fract:
1682 inst = emit(FRC(result_dst, op[0]));
1683 break;
1684 case ir_unop_round_even:
1685 emit(RNDE(result_dst, op[0]));
1686 break;
1687
1688 case ir_binop_min:
1689 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1690 break;
1691 case ir_binop_max:
1692 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1693 break;
1694
1695 case ir_binop_pow:
1696 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1697 break;
1698
1699 case ir_unop_bit_not:
1700 inst = emit(NOT(result_dst, op[0]));
1701 break;
1702 case ir_binop_bit_and:
1703 inst = emit(AND(result_dst, op[0], op[1]));
1704 break;
1705 case ir_binop_bit_xor:
1706 inst = emit(XOR(result_dst, op[0], op[1]));
1707 break;
1708 case ir_binop_bit_or:
1709 inst = emit(OR(result_dst, op[0], op[1]));
1710 break;
1711
1712 case ir_binop_lshift:
1713 inst = emit(SHL(result_dst, op[0], op[1]));
1714 break;
1715
1716 case ir_binop_rshift:
1717 if (ir->type->base_type == GLSL_TYPE_INT)
1718 inst = emit(ASR(result_dst, op[0], op[1]));
1719 else
1720 inst = emit(SHR(result_dst, op[0], op[1]));
1721 break;
1722
1723 case ir_binop_bfm:
1724 emit(BFI1(result_dst, op[0], op[1]));
1725 break;
1726
1727 case ir_binop_ubo_load: {
1728 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1729 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1730 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1731 src_reg offset;
1732
1733 /* Now, load the vector from that offset. */
1734 assert(ir->type->is_vector() || ir->type->is_scalar());
1735
1736 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1737 packed_consts.type = result.type;
1738 src_reg surf_index;
1739
1740 if (const_uniform_block) {
1741 /* The block index is a constant, so just emit the binding table entry
1742 * as an immediate.
1743 */
1744 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1745 const_uniform_block->value.u[0]);
1746 } else {
1747 /* The block index is not a constant. Evaluate the index expression
1748 * per-channel and add the base UBO index; the generator will select
1749 * a value from any live channel.
1750 */
1751 surf_index = src_reg(this, glsl_type::uint_type);
1752 emit(ADD(dst_reg(surf_index), op[0],
1753 src_reg(prog_data->base.binding_table.ubo_start)));
1754
1755 /* Assume this may touch any UBO. It would be nice to provide
1756 * a tighter bound, but the array information is already lowered away.
1757 */
1758 brw_mark_surface_used(&prog_data->base,
1759 prog_data->base.binding_table.ubo_start +
1760 shader_prog->NumUniformBlocks - 1);
1761 }
1762
1763 if (const_offset_ir) {
1764 if (brw->gen >= 8) {
1765 /* Store the offset in a GRF so we can send-from-GRF. */
1766 offset = src_reg(this, glsl_type::int_type);
1767 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1768 } else {
1769 /* Immediates are fine on older generations since they'll be moved
1770 * to a (potentially fake) MRF at the generator level.
1771 */
1772 offset = src_reg(const_offset / 16);
1773 }
1774 } else {
1775 offset = src_reg(this, glsl_type::uint_type);
1776 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1777 }
1778
1779 if (brw->gen >= 7) {
1780 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1781
1782 /* We have to use a message header on Skylake to get SIMD4x2 mode.
1783 * Reserve space for the register.
1784 */
1785 if (brw->gen >= 9) {
1786 grf_offset.reg_offset++;
1787 alloc.sizes[grf_offset.reg] = 2;
1788 }
1789
1790 grf_offset.type = offset.type;
1791
1792 emit(MOV(grf_offset, offset));
1793
1794 vec4_instruction *pull =
1795 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1796 dst_reg(packed_consts),
1797 surf_index,
1798 src_reg(grf_offset)));
1799 pull->mlen = 1;
1800 } else {
1801 vec4_instruction *pull =
1802 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1803 dst_reg(packed_consts),
1804 surf_index,
1805 offset));
1806 pull->base_mrf = 14;
1807 pull->mlen = 1;
1808 }
1809
1810 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1811 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1812 const_offset % 16 / 4,
1813 const_offset % 16 / 4,
1814 const_offset % 16 / 4);
1815
1816 /* UBO bools are any nonzero int. We need to convert them to use the
1817 * value of true stored in ctx->Const.UniformBooleanTrue.
1818 */
1819 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1820 emit(CMP(result_dst, packed_consts, src_reg(0u),
1821 BRW_CONDITIONAL_NZ));
1822 } else {
1823 emit(MOV(result_dst, packed_consts));
1824 }
1825 break;
1826 }
1827
1828 case ir_binop_vector_extract:
1829 unreachable("should have been lowered by vec_index_to_cond_assign");
1830
1831 case ir_triop_fma:
1832 op[0] = fix_3src_operand(op[0]);
1833 op[1] = fix_3src_operand(op[1]);
1834 op[2] = fix_3src_operand(op[2]);
1835 /* Note that the instruction's argument order is reversed from GLSL
1836 * and the IR.
1837 */
1838 emit(MAD(result_dst, op[2], op[1], op[0]));
1839 break;
1840
1841 case ir_triop_lrp:
1842 emit_lrp(result_dst, op[0], op[1], op[2]);
1843 break;
1844
1845 case ir_triop_csel:
1846 unreachable("already handled above");
1847 break;
1848
1849 case ir_triop_bfi:
1850 op[0] = fix_3src_operand(op[0]);
1851 op[1] = fix_3src_operand(op[1]);
1852 op[2] = fix_3src_operand(op[2]);
1853 emit(BFI2(result_dst, op[0], op[1], op[2]));
1854 break;
1855
1856 case ir_triop_bitfield_extract:
1857 op[0] = fix_3src_operand(op[0]);
1858 op[1] = fix_3src_operand(op[1]);
1859 op[2] = fix_3src_operand(op[2]);
1860 /* Note that the instruction's argument order is reversed from GLSL
1861 * and the IR.
1862 */
1863 emit(BFE(result_dst, op[2], op[1], op[0]));
1864 break;
1865
1866 case ir_triop_vector_insert:
1867 unreachable("should have been lowered by lower_vector_insert");
1868
1869 case ir_quadop_bitfield_insert:
1870 unreachable("not reached: should be handled by "
1871 "bitfield_insert_to_bfm_bfi\n");
1872
1873 case ir_quadop_vector:
1874 unreachable("not reached: should be handled by lower_quadop_vector");
1875
1876 case ir_unop_pack_half_2x16:
1877 emit_pack_half_2x16(result_dst, op[0]);
1878 break;
1879 case ir_unop_unpack_half_2x16:
1880 emit_unpack_half_2x16(result_dst, op[0]);
1881 break;
1882 case ir_unop_unpack_unorm_4x8:
1883 emit_unpack_unorm_4x8(result_dst, op[0]);
1884 break;
1885 case ir_unop_unpack_snorm_4x8:
1886 emit_unpack_snorm_4x8(result_dst, op[0]);
1887 break;
1888 case ir_unop_pack_unorm_4x8:
1889 emit_pack_unorm_4x8(result_dst, op[0]);
1890 break;
1891 case ir_unop_pack_snorm_4x8:
1892 emit_pack_snorm_4x8(result_dst, op[0]);
1893 break;
1894 case ir_unop_pack_snorm_2x16:
1895 case ir_unop_pack_unorm_2x16:
1896 case ir_unop_unpack_snorm_2x16:
1897 case ir_unop_unpack_unorm_2x16:
1898 unreachable("not reached: should be handled by lower_packing_builtins");
1899 case ir_unop_unpack_half_2x16_split_x:
1900 case ir_unop_unpack_half_2x16_split_y:
1901 case ir_binop_pack_half_2x16_split:
1902 case ir_unop_interpolate_at_centroid:
1903 case ir_binop_interpolate_at_sample:
1904 case ir_binop_interpolate_at_offset:
1905 unreachable("not reached: should not occur in vertex shader");
1906 case ir_binop_ldexp:
1907 unreachable("not reached: should be handled by ldexp_to_arith()");
1908 case ir_unop_d2f:
1909 case ir_unop_f2d:
1910 case ir_unop_d2i:
1911 case ir_unop_i2d:
1912 case ir_unop_d2u:
1913 case ir_unop_u2d:
1914 case ir_unop_d2b:
1915 case ir_unop_pack_double_2x32:
1916 case ir_unop_unpack_double_2x32:
1917 case ir_unop_frexp_sig:
1918 case ir_unop_frexp_exp:
1919 unreachable("fp64 todo");
1920 }
1921 }
1922
1923
1924 void
1925 vec4_visitor::visit(ir_swizzle *ir)
1926 {
1927 /* Note that this is only swizzles in expressions, not those on the left
1928 * hand side of an assignment, which do write masking. See ir_assignment
1929 * for that.
1930 */
1931 const unsigned swz = brw_compose_swizzle(
1932 brw_swizzle_for_size(ir->type->vector_elements),
1933 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
1934
1935 ir->val->accept(this);
1936 this->result = swizzle(this->result, swz);
1937 }
1938
1939 void
1940 vec4_visitor::visit(ir_dereference_variable *ir)
1941 {
1942 const struct glsl_type *type = ir->type;
1943 dst_reg *reg = variable_storage(ir->var);
1944
1945 if (!reg) {
1946 fail("Failed to find variable storage for %s\n", ir->var->name);
1947 this->result = src_reg(brw_null_reg());
1948 return;
1949 }
1950
1951 this->result = src_reg(*reg);
1952
1953 /* System values get their swizzle from the dst_reg writemask */
1954 if (ir->var->data.mode == ir_var_system_value)
1955 return;
1956
1957 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1958 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
1959 }
1960
1961
1962 int
1963 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1964 {
1965 /* Under normal circumstances array elements are stored consecutively, so
1966 * the stride is equal to the size of the array element.
1967 */
1968 return type_size(ir->type);
1969 }
1970
1971
1972 void
1973 vec4_visitor::visit(ir_dereference_array *ir)
1974 {
1975 ir_constant *constant_index;
1976 src_reg src;
1977 int array_stride = compute_array_stride(ir);
1978
1979 constant_index = ir->array_index->constant_expression_value();
1980
1981 ir->array->accept(this);
1982 src = this->result;
1983
1984 if (constant_index) {
1985 src.reg_offset += constant_index->value.i[0] * array_stride;
1986 } else {
1987 /* Variable index array dereference. It eats the "vec4" of the
1988 * base of the array and an index that offsets the Mesa register
1989 * index.
1990 */
1991 ir->array_index->accept(this);
1992
1993 src_reg index_reg;
1994
1995 if (array_stride == 1) {
1996 index_reg = this->result;
1997 } else {
1998 index_reg = src_reg(this, glsl_type::int_type);
1999
2000 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2001 }
2002
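/* A sketch of the nested case handled below: if the base of the array was
 * itself reached through a variable index, fold the existing reladdr into
 * the new index so that a single reladdr remains on the source.
 */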
2003 if (src.reladdr) {
2004 src_reg temp = src_reg(this, glsl_type::int_type);
2005
2006 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2007
2008 index_reg = temp;
2009 }
2010
2011 src.reladdr = ralloc(mem_ctx, src_reg);
2012 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2013 }
2014
2015 /* If the type is smaller than a vec4, replicate the last channel out. */
2016 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2017 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2018 else
2019 src.swizzle = BRW_SWIZZLE_NOOP;
2020 src.type = brw_type_for_base_type(ir->type);
2021
2022 this->result = src;
2023 }
2024
2025 void
2026 vec4_visitor::visit(ir_dereference_record *ir)
2027 {
2028 unsigned int i;
2029 const glsl_type *struct_type = ir->record->type;
2030 int offset = 0;
2031
2032 ir->record->accept(this);
2033
2034 for (i = 0; i < struct_type->length; i++) {
2035 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2036 break;
2037 offset += type_size(struct_type->fields.structure[i].type);
2038 }
2039
2040 /* If the type is smaller than a vec4, replicate the last channel out. */
2041 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2042 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2043 else
2044 this->result.swizzle = BRW_SWIZZLE_NOOP;
2045 this->result.type = brw_type_for_base_type(ir->type);
2046
2047 this->result.reg_offset += offset;
2048 }
2049
2050 /**
2051 * We want to be careful in assignment setup to hit the actual storage
2052 * instead of potentially using a temporary like we might with the
2053 * ir_dereference handler.
2054 */
2055 static dst_reg
2056 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2057 {
2058 /* The LHS must be a dereference. If the LHS is a variable indexed array
2059 * access of a vector, it must be separated into a series of conditional moves
2060 * before reaching this point (see ir_vec_index_to_cond_assign).
2061 */
2062 assert(ir->as_dereference());
2063 ir_dereference_array *deref_array = ir->as_dereference_array();
2064 if (deref_array) {
2065 assert(!deref_array->array->type->is_vector());
2066 }
2067
2068 /* Use the rvalue deref handler for the most part. We'll ignore
2069 * swizzles in it and write swizzles using writemask, though.
2070 */
2071 ir->accept(v);
2072 return dst_reg(v->result);
2073 }
2074
2075 void
2076 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2077 const struct glsl_type *type,
2078 enum brw_predicate predicate)
2079 {
2080 if (type->base_type == GLSL_TYPE_STRUCT) {
2081 for (unsigned int i = 0; i < type->length; i++) {
2082 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2083 }
2084 return;
2085 }
2086
2087 if (type->is_array()) {
2088 for (unsigned int i = 0; i < type->length; i++) {
2089 emit_block_move(dst, src, type->fields.array, predicate);
2090 }
2091 return;
2092 }
2093
2094 if (type->is_matrix()) {
2095 const struct glsl_type *vec_type;
2096
2097 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2098 type->vector_elements, 1);
2099
2100 for (int i = 0; i < type->matrix_columns; i++) {
2101 emit_block_move(dst, src, vec_type, predicate);
2102 }
2103 return;
2104 }
2105
2106 assert(type->is_scalar() || type->is_vector());
2107
2108 dst->type = brw_type_for_base_type(type);
2109 src->type = dst->type;
2110
2111 dst->writemask = (1 << type->vector_elements) - 1;
2112
2113 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2114
2115 vec4_instruction *inst = emit(MOV(*dst, *src));
2116 inst->predicate = predicate;
2117
2118 dst->reg_offset++;
2119 src->reg_offset++;
2120 }
2121
2122
2123 /* If the RHS processing resulted in an instruction generating a
2124 * temporary value, and it would be easy to rewrite the instruction to
2125 * generate its result right into the LHS instead, do so. This ends
2126 * up reliably removing instructions where it can be tricky to do so
2127 * later without real UD chain information.
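*
* For example, when the RHS of an assignment is a single ADD into a fresh
* temporary, that ADD can be retargeted to write the LHS storage directly,
* so the copy MOV never has to be emitted.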
2128 */
2129 bool
2130 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2131 dst_reg dst,
2132 src_reg src,
2133 vec4_instruction *pre_rhs_inst,
2134 vec4_instruction *last_rhs_inst)
2135 {
2136 /* This could be supported, but it would take more smarts. */
2137 if (ir->condition)
2138 return false;
2139
2140 if (pre_rhs_inst == last_rhs_inst)
2141 return false; /* No instructions generated to work with. */
2142
2143 /* Make sure the last instruction generated our source reg. */
2144 if (src.file != GRF ||
2145 src.file != last_rhs_inst->dst.file ||
2146 src.reg != last_rhs_inst->dst.reg ||
2147 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2148 src.reladdr ||
2149 src.abs ||
2150 src.negate ||
2151 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2152 return false;
2153
2154 * Check that the last instruction fully initialized the channels
2155 * we want to use, in the order we want to use them. We could
2156 * potentially reswizzle the operands of many instructions so that
2157 * we could handle out of order channels, but don't yet.
2158 */
2159
2160 for (unsigned i = 0; i < 4; i++) {
2161 if (dst.writemask & (1 << i)) {
2162 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2163 return false;
2164
2165 if (BRW_GET_SWZ(src.swizzle, i) != i)
2166 return false;
2167 }
2168 }
2169
2170 /* Success! Rewrite the instruction. */
2171 last_rhs_inst->dst.file = dst.file;
2172 last_rhs_inst->dst.reg = dst.reg;
2173 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2174 last_rhs_inst->dst.reladdr = dst.reladdr;
2175 last_rhs_inst->dst.writemask &= dst.writemask;
2176
2177 return true;
2178 }
2179
2180 void
2181 vec4_visitor::visit(ir_assignment *ir)
2182 {
2183 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2184 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2185
2186 if (!ir->lhs->type->is_scalar() &&
2187 !ir->lhs->type->is_vector()) {
2188 ir->rhs->accept(this);
2189 src_reg src = this->result;
2190
2191 if (ir->condition) {
2192 emit_bool_to_cond_code(ir->condition, &predicate);
2193 }
2194
2195 /* emit_block_move doesn't account for swizzles in the source register.
2196 * This should be ok, since the source register is a structure or an
2197 * array, and those can't be swizzled. But double-check to be sure.
2198 */
2199 assert(src.swizzle ==
2200 (ir->rhs->type->is_matrix()
2201 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2202 : BRW_SWIZZLE_NOOP));
2203
2204 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2205 return;
2206 }
2207
2208 /* Now we're down to just a scalar/vector with writemasks. */
2209 int i;
2210
2211 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2212 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2213
2214 ir->rhs->accept(this);
2215
2216 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2217
2218 int swizzles[4];
2219 int src_chan = 0;
2220
2221 assert(ir->lhs->type->is_vector() ||
2222 ir->lhs->type->is_scalar());
2223 dst.writemask = ir->write_mask;
2224
2225 /* Swizzle a small RHS vector into the channels being written.
2226 *
2227 * GLSL IR treats write_mask as dictating how many channels are
2228 * present on the RHS, while in our instructions we need to make
2229 * those channels appear in the slots of the vec4 they're written to.
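*
* For example, with a .zw write mask the loop below produces the swizzle
* (x, x, x, y), so the two RHS channels in .xy land in the .z and .w
* slots of the destination.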
2230 */
2231 for (int i = 0; i < 4; i++)
2232 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2233
2234 src_reg src = swizzle(this->result,
2235 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2236 swizzles[2], swizzles[3]));
2237
2238 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2239 return;
2240 }
2241
2242 if (ir->condition) {
2243 emit_bool_to_cond_code(ir->condition, &predicate);
2244 }
2245
2246 for (i = 0; i < type_size(ir->lhs->type); i++) {
2247 vec4_instruction *inst = emit(MOV(dst, src));
2248 inst->predicate = predicate;
2249
2250 dst.reg_offset++;
2251 src.reg_offset++;
2252 }
2253 }
2254
2255 void
2256 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2257 {
2258 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2259 foreach_in_list(ir_constant, field_value, &ir->components) {
2260 emit_constant_values(dst, field_value);
2261 }
2262 return;
2263 }
2264
2265 if (ir->type->is_array()) {
2266 for (unsigned int i = 0; i < ir->type->length; i++) {
2267 emit_constant_values(dst, ir->array_elements[i]);
2268 }
2269 return;
2270 }
2271
2272 if (ir->type->is_matrix()) {
2273 for (int i = 0; i < ir->type->matrix_columns; i++) {
2274 float *vec = &ir->value.f[i * ir->type->vector_elements];
2275
2276 for (int j = 0; j < ir->type->vector_elements; j++) {
2277 dst->writemask = 1 << j;
2278 dst->type = BRW_REGISTER_TYPE_F;
2279
2280 emit(MOV(*dst, src_reg(vec[j])));
2281 }
2282 dst->reg_offset++;
2283 }
2284 return;
2285 }
2286
2287 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2288
2289 for (int i = 0; i < ir->type->vector_elements; i++) {
2290 if (!(remaining_writemask & (1 << i)))
2291 continue;
2292
2293 dst->writemask = 1 << i;
2294 dst->type = brw_type_for_base_type(ir->type);
2295
2296 /* Find other components that match the one we're about to
2297 * write. Emits fewer instructions for things like vec4(0.5,
2298 * 1.5, 1.5, 1.5).
2299 */
2300 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2301 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2302 if (ir->value.b[i] == ir->value.b[j])
2303 dst->writemask |= (1 << j);
2304 } else {
2305 /* u, i, and f storage all line up, so no need for a
2306 * switch case for comparing each type.
2307 */
2308 if (ir->value.u[i] == ir->value.u[j])
2309 dst->writemask |= (1 << j);
2310 }
2311 }
2312
2313 switch (ir->type->base_type) {
2314 case GLSL_TYPE_FLOAT:
2315 emit(MOV(*dst, src_reg(ir->value.f[i])));
2316 break;
2317 case GLSL_TYPE_INT:
2318 emit(MOV(*dst, src_reg(ir->value.i[i])));
2319 break;
2320 case GLSL_TYPE_UINT:
2321 emit(MOV(*dst, src_reg(ir->value.u[i])));
2322 break;
2323 case GLSL_TYPE_BOOL:
2324 emit(MOV(*dst,
2325 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2326 : 0)));
2327 break;
2328 default:
2329 unreachable("Non-float/uint/int/bool constant");
2330 }
2331
2332 remaining_writemask &= ~dst->writemask;
2333 }
2334 dst->reg_offset++;
2335 }
2336
2337 void
2338 vec4_visitor::visit(ir_constant *ir)
2339 {
2340 dst_reg dst = dst_reg(this, ir->type);
2341 this->result = src_reg(dst);
2342
2343 emit_constant_values(&dst, ir);
2344 }
2345
2346 void
2347 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2348 {
2349 ir_dereference *deref = static_cast<ir_dereference *>(
2350 ir->actual_parameters.get_head());
2351 ir_variable *location = deref->variable_referenced();
2352 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2353 location->data.binding);
2354
2355 /* Calculate the surface offset */
2356 src_reg offset(this, glsl_type::uint_type);
2357 ir_dereference_array *deref_array = deref->as_dereference_array();
2358 if (deref_array) {
2359 deref_array->array_index->accept(this);
2360
2361 src_reg tmp(this, glsl_type::uint_type);
2362 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2363 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2364 } else {
2365 offset = location->data.atomic.offset;
2366 }
2367
2368 /* Emit the appropriate machine instruction */
2369 const char *callee = ir->callee->function_name();
2370 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2371
2372 if (!strcmp("__intrinsic_atomic_read", callee)) {
2373 emit_untyped_surface_read(surf_index, dst, offset);
2374
2375 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2376 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2377 src_reg(), src_reg());
2378
2379 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2380 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2381 src_reg(), src_reg());
2382 }
2383 }
2384
2385 void
2386 vec4_visitor::visit(ir_call *ir)
2387 {
2388 const char *callee = ir->callee->function_name();
2389
2390 if (!strcmp("__intrinsic_atomic_read", callee) ||
2391 !strcmp("__intrinsic_atomic_increment", callee) ||
2392 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2393 visit_atomic_counter_intrinsic(ir);
2394 } else {
2395 unreachable("Unsupported intrinsic.");
2396 }
2397 }
2398
2399 src_reg
2400 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2401 {
2402 vec4_instruction *inst =
2403 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2404 dst_reg(this, glsl_type::uvec4_type));
2405 inst->base_mrf = 2;
2406 inst->mlen = 1;
2407 inst->src[1] = sampler;
2408
2409 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2410 int param_base = inst->base_mrf;
2411 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2412 int zero_mask = 0xf & ~coord_mask;
2413
2414 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2415 coordinate));
2416
2417 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2418 src_reg(0)));
2419
2420 emit(inst);
2421 return src_reg(inst->dst);
2422 }
2423
2424 static bool
2425 is_high_sampler(struct brw_context *brw, src_reg sampler)
2426 {
2427 if (brw->gen < 8 && !brw->is_haswell)
2428 return false;
2429
2430 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2431 }
2432
2433 void
2434 vec4_visitor::visit(ir_texture *ir)
2435 {
2436 uint32_t sampler =
2437 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2438
2439 ir_rvalue *nonconst_sampler_index =
2440 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2441
2442 /* Handle non-constant sampler array indexing */
2443 src_reg sampler_reg;
2444 if (nonconst_sampler_index) {
2445 /* The highest sampler which may be used by this operation is
2446 * the last element of the array. Mark it here, because the generator
2447 * doesn't have enough information to determine the bound.
2448 */
2449 uint32_t array_size = ir->sampler->as_dereference_array()
2450 ->array->type->array_size();
2451
2452 uint32_t max_used = sampler + array_size - 1;
2453 if (ir->op == ir_tg4 && brw->gen < 8) {
2454 max_used += prog_data->base.binding_table.gather_texture_start;
2455 } else {
2456 max_used += prog_data->base.binding_table.texture_start;
2457 }
2458
2459 brw_mark_surface_used(&prog_data->base, max_used);
2460
2461 /* Emit code to evaluate the actual indexing expression */
2462 nonconst_sampler_index->accept(this);
2463 dst_reg temp(this, glsl_type::uint_type);
2464 emit(ADD(temp, this->result, src_reg(sampler)))
2465 ->force_writemask_all = true;
2466 sampler_reg = src_reg(temp);
2467 } else {
2468 /* Single sampler, or constant array index; the indexing expression
2469 * is just an immediate.
2470 */
2471 sampler_reg = src_reg(sampler);
2472 }
2473
2474 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2475 * emitting anything other than setting up the constant result.
2476 */
2477 if (ir->op == ir_tg4) {
2478 ir_constant *chan = ir->lod_info.component->as_constant();
2479 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2480 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2481 dst_reg result(this, ir->type);
2482 this->result = src_reg(result);
2483 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2484 return;
2485 }
2486 }
2487
2488 /* Should be lowered by do_lower_texture_projection */
2489 assert(!ir->projector);
2490
2491 /* Should be lowered */
2492 assert(!ir->offset || !ir->offset->type->is_array());
2493
2494 /* Generate code to compute all the subexpression trees. This has to be
2495 * done before loading any values into MRFs for the sampler message since
2496 * generating these values may involve SEND messages that need the MRFs.
2497 */
2498 src_reg coordinate;
2499 if (ir->coordinate) {
2500 ir->coordinate->accept(this);
2501 coordinate = this->result;
2502 }
2503
2504 src_reg shadow_comparitor;
2505 if (ir->shadow_comparitor) {
2506 ir->shadow_comparitor->accept(this);
2507 shadow_comparitor = this->result;
2508 }
2509
2510 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2511 src_reg offset_value;
2512 if (has_nonconstant_offset) {
2513 ir->offset->accept(this);
2514 offset_value = src_reg(this->result);
2515 }
2516
2517 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2518 src_reg lod, dPdx, dPdy, sample_index, mcs;
2519 switch (ir->op) {
2520 case ir_tex:
2521 lod = src_reg(0.0f);
2522 lod_type = glsl_type::float_type;
2523 break;
2524 case ir_txf:
2525 case ir_txl:
2526 case ir_txs:
2527 ir->lod_info.lod->accept(this);
2528 lod = this->result;
2529 lod_type = ir->lod_info.lod->type;
2530 break;
2531 case ir_query_levels:
2532 lod = src_reg(0);
2533 lod_type = glsl_type::int_type;
2534 break;
2535 case ir_txf_ms:
2536 ir->lod_info.sample_index->accept(this);
2537 sample_index = this->result;
2538 sample_index_type = ir->lod_info.sample_index->type;
2539
2540 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2541 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2542 else
2543 mcs = src_reg(0u);
2544 break;
2545 case ir_txd:
2546 ir->lod_info.grad.dPdx->accept(this);
2547 dPdx = this->result;
2548
2549 ir->lod_info.grad.dPdy->accept(this);
2550 dPdy = this->result;
2551
2552 lod_type = ir->lod_info.grad.dPdx->type;
2553 break;
2554 case ir_txb:
2555 case ir_lod:
2556 case ir_tg4:
2557 break;
2558 }
2559
2560 enum opcode opcode;
2561 switch (ir->op) {
2562 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2563 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2564 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2565 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2566 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2567 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2568 case ir_tg4: opcode = has_nonconstant_offset
2569 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2570 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2571 case ir_txb:
2572 unreachable("TXB is not valid for vertex shaders.");
2573 case ir_lod:
2574 unreachable("LOD is not valid for vertex shaders.");
2575 default:
2576 unreachable("Unrecognized tex op");
2577 }
2578
2579 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2580 opcode, dst_reg(this, ir->type));
2581
2582 if (ir->offset != NULL && !has_nonconstant_offset) {
2583 inst->offset =
2584 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2585 ir->offset->type->vector_elements);
2586 }
2587
2588 /* Stuff the channel select bits in the top of the texture offset */
2589 if (ir->op == ir_tg4)
2590 inst->offset |= gather_channel(ir, sampler) << 16;
2591
2592 /* The message header is necessary for:
2593 * - Gen4 (always)
2594 * - Gen9+ for selecting SIMD4x2
2595 * - Texel offsets
2596 * - Gather channel selection
2597 * - Sampler indices too large to fit in a 4-bit value.
2598 */
2599 inst->header_present =
2600 brw->gen < 5 || brw->gen >= 9 ||
2601 inst->offset != 0 || ir->op == ir_tg4 ||
2602 is_high_sampler(brw, sampler_reg);
2603 inst->base_mrf = 2;
2604 inst->mlen = inst->header_present + 1; /* always at least one */
2605 inst->dst.writemask = WRITEMASK_XYZW;
2606 inst->shadow_compare = ir->shadow_comparitor != NULL;
2607
2608 inst->src[1] = sampler_reg;
2609
2610 /* MRF for the first parameter */
2611 int param_base = inst->base_mrf + inst->header_present;
2612
2613 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2614 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2615 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2616 } else {
2617 /* Load the coordinate */
2618 /* FINISHME: gl_clamp_mask and saturate */
2619 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2620 int zero_mask = 0xf & ~coord_mask;
2621
2622 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2623 coordinate));
2624
2625 if (zero_mask != 0) {
2626 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2627 src_reg(0)));
2628 }
2629 /* Load the shadow comparitor */
2630 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2631 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2632 WRITEMASK_X),
2633 shadow_comparitor));
2634 inst->mlen++;
2635 }
2636
2637 /* Load the LOD info */
2638 if (ir->op == ir_tex || ir->op == ir_txl) {
2639 int mrf, writemask;
2640 if (brw->gen >= 5) {
2641 mrf = param_base + 1;
2642 if (ir->shadow_comparitor) {
2643 writemask = WRITEMASK_Y;
2644 /* mlen already incremented */
2645 } else {
2646 writemask = WRITEMASK_X;
2647 inst->mlen++;
2648 }
2649 } else /* brw->gen == 4 */ {
2650 mrf = param_base;
2651 writemask = WRITEMASK_W;
2652 }
2653 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2654 } else if (ir->op == ir_txf) {
2655 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2656 } else if (ir->op == ir_txf_ms) {
2657 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2658 sample_index));
2659 if (brw->gen >= 7) {
2660 /* MCS data is in the first channel of `mcs`, but we need to get it into
2661 * the .y channel of the second vec4 of params, so replicate .x across
2662 * the whole vec4 and then mask off everything except .y
2663 */
2664 mcs.swizzle = BRW_SWIZZLE_XXXX;
2665 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2666 mcs));
2667 }
2668 inst->mlen++;
2669 } else if (ir->op == ir_txd) {
2670 const glsl_type *type = lod_type;
2671
2672 if (brw->gen >= 5) {
2673 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2674 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2675 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2676 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2677 inst->mlen++;
2678
2679 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2680 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2681 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2682 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2683 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2684 inst->mlen++;
2685
2686 if (ir->shadow_comparitor) {
2687 emit(MOV(dst_reg(MRF, param_base + 2,
2688 ir->shadow_comparitor->type, WRITEMASK_Z),
2689 shadow_comparitor));
2690 }
2691 }
2692 } else /* brw->gen == 4 */ {
2693 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2694 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2695 inst->mlen += 2;
2696 }
2697 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2698 if (ir->shadow_comparitor) {
2699 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2700 shadow_comparitor));
2701 }
2702
2703 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2704 offset_value));
2705 inst->mlen++;
2706 }
2707 }
2708
2709 emit(inst);
2710
2711 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2712 * spec requires layers.
2713 */
2714 if (ir->op == ir_txs) {
2715 glsl_type const *type = ir->sampler->type;
2716 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2717 type->sampler_array) {
2718 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2719 writemask(inst->dst, WRITEMASK_Z),
2720 src_reg(inst->dst), src_reg(6));
2721 }
2722 }
2723
2724 if (brw->gen == 6 && ir->op == ir_tg4) {
2725 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2726 }
2727
2728 swizzle_result(ir, src_reg(inst->dst), sampler);
2729 }
2730
2731 /**
2732 * Apply workarounds for Gen6 gather with UINT/SINT
2733 */
2734 void
2735 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2736 {
2737 if (!wa)
2738 return;
2739
2740 int width = (wa & WA_8BIT) ? 8 : 16;
2741 dst_reg dst_f = dst;
2742 dst_f.type = BRW_REGISTER_TYPE_F;
2743
2744 /* Convert from UNORM to UINT */
2745 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2746 emit(MOV(dst, src_reg(dst_f)));
2747
2748 if (wa & WA_SIGN) {
2749 /* Reinterpret the UINT value as a signed INT value by
2750 * shifting the sign bit into place, then shifting back
2751 * preserving sign.
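*
* For 8-bit data this is a left shift by 24 followed by an arithmetic
* right shift by 24, which sign-extends bit 7 across the upper bits.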
2752 */
2753 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2754 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2755 }
2756 }
2757
2758 /**
2759 * Set up the gather channel based on the swizzle, for gather4.
2760 */
2761 uint32_t
2762 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2763 {
2764 ir_constant *chan = ir->lod_info.component->as_constant();
2765 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2766 switch (swiz) {
2767 case SWIZZLE_X: return 0;
2768 case SWIZZLE_Y:
2769 /* gather4 sampler is broken for green channel on RG32F --
2770 * we must ask for blue instead.
2771 */
2772 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2773 return 2;
2774 return 1;
2775 case SWIZZLE_Z: return 2;
2776 case SWIZZLE_W: return 3;
2777 default:
2778 unreachable("Not reached"); /* zero, one swizzles handled already */
2779 }
2780 }
2781
2782 void
2783 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2784 {
2785 int s = key->tex.swizzles[sampler];
2786
2787 this->result = src_reg(this, ir->type);
2788 dst_reg swizzled_result(this->result);
2789
2790 if (ir->op == ir_query_levels) {
2791 /* # levels is in .w */
2792 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2793 emit(MOV(swizzled_result, orig_val));
2794 return;
2795 }
2796
2797 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2798 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2799 emit(MOV(swizzled_result, orig_val));
2800 return;
2801 }
2802
2803
2804 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2805 int swizzle[4] = {0};
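/* Split the swizzle into channels copied from the texture result and
 * channels that are constant; e.g. an "RG01" swizzle copies .xy and
 * writes 0.0 to .z and 1.0 to .w below.
 */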
2806
2807 for (int i = 0; i < 4; i++) {
2808 switch (GET_SWZ(s, i)) {
2809 case SWIZZLE_ZERO:
2810 zero_mask |= (1 << i);
2811 break;
2812 case SWIZZLE_ONE:
2813 one_mask |= (1 << i);
2814 break;
2815 default:
2816 copy_mask |= (1 << i);
2817 swizzle[i] = GET_SWZ(s, i);
2818 break;
2819 }
2820 }
2821
2822 if (copy_mask) {
2823 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2824 swizzled_result.writemask = copy_mask;
2825 emit(MOV(swizzled_result, orig_val));
2826 }
2827
2828 if (zero_mask) {
2829 swizzled_result.writemask = zero_mask;
2830 emit(MOV(swizzled_result, src_reg(0.0f)));
2831 }
2832
2833 if (one_mask) {
2834 swizzled_result.writemask = one_mask;
2835 emit(MOV(swizzled_result, src_reg(1.0f)));
2836 }
2837 }
2838
2839 void
2840 vec4_visitor::visit(ir_return *)
2841 {
2842 unreachable("not reached");
2843 }
2844
2845 void
2846 vec4_visitor::visit(ir_discard *)
2847 {
2848 unreachable("not reached");
2849 }
2850
2851 void
2852 vec4_visitor::visit(ir_if *ir)
2853 {
2854 /* Don't point the annotation at the if statement, because then it plus
2855 * the then and else blocks get printed.
2856 */
2857 this->base_ir = ir->condition;
2858
2859 if (brw->gen == 6) {
2860 emit_if_gen6(ir);
2861 } else {
2862 enum brw_predicate predicate;
2863 emit_bool_to_cond_code(ir->condition, &predicate);
2864 emit(IF(predicate));
2865 }
2866
2867 visit_instructions(&ir->then_instructions);
2868
2869 if (!ir->else_instructions.is_empty()) {
2870 this->base_ir = ir->condition;
2871 emit(BRW_OPCODE_ELSE);
2872
2873 visit_instructions(&ir->else_instructions);
2874 }
2875
2876 this->base_ir = ir->condition;
2877 emit(BRW_OPCODE_ENDIF);
2878 }
2879
2880 void
2881 vec4_visitor::visit(ir_emit_vertex *)
2882 {
2883 unreachable("not reached");
2884 }
2885
2886 void
2887 vec4_visitor::visit(ir_end_primitive *)
2888 {
2889 unreachable("not reached");
2890 }
2891
2892 void
2893 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2894 dst_reg dst, src_reg offset,
2895 src_reg src0, src_reg src1)
2896 {
2897 unsigned mlen = 0;
2898
2899 /* Set the atomic operation offset. */
2900 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2901 mlen++;
2902
2903 /* Set the atomic operation arguments. */
2904 if (src0.file != BAD_FILE) {
2905 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2906 mlen++;
2907 }
2908
2909 if (src1.file != BAD_FILE) {
2910 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2911 mlen++;
2912 }
2913
2914 /* Emit the instruction. Note that this maps to the normal SIMD8
2915 * untyped atomic message on Ivy Bridge, but that's OK because
2916 * unused channels will be masked out.
2917 */
2918 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2919 src_reg(atomic_op), src_reg(surf_index));
2920 inst->base_mrf = 0;
2921 inst->mlen = mlen;
2922 }
2923
2924 void
2925 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2926 src_reg offset)
2927 {
2928 /* Set the surface read offset. */
2929 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2930
2931 /* Emit the instruction. Note that this maps to the normal SIMD8
2932 * untyped surface read message, but that's OK because unused
2933 * channels will be masked out.
2934 */
2935 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2936 dst, src_reg(surf_index));
2937 inst->base_mrf = 0;
2938 inst->mlen = 1;
2939 }
2940
2941 void
2942 vec4_visitor::emit_ndc_computation()
2943 {
2944 /* Get the position */
2945 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2946
2947 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2948 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2949 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2950
2951 current_annotation = "NDC";
2952 dst_reg ndc_w = ndc;
2953 ndc_w.writemask = WRITEMASK_W;
2954 src_reg pos_w = pos;
2955 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2956 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2957
2958 dst_reg ndc_xyz = ndc;
2959 ndc_xyz.writemask = WRITEMASK_XYZ;
2960
2961 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2962 }
2963
2964 void
2965 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2966 {
2967 if (brw->gen < 6 &&
2968 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2969 key->userclip_active || brw->has_negative_rhw_bug)) {
2970 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2971 dst_reg header1_w = header1;
2972 header1_w.writemask = WRITEMASK_W;
2973
2974 emit(MOV(header1, 0u));
2975
2976 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2977 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2978
2979 current_annotation = "Point size";
2980 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2981 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2982 }
2983
2984 if (key->userclip_active) {
2985 current_annotation = "Clipping flags";
2986 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2987 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2988
2989 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2990 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2991 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2992
2993 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2994 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2995 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2996 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2997 }
2998
2999 /* i965 clipping workaround:
3000 * 1) Test for -ve rhw
3001 * 2) If set,
3002 * set ndc = (0,0,0,0)
3003 * set ucp[6] = 1
3004 *
3005 * Later, clipping will detect ucp[6] and ensure the primitive is
3006 * clipped against all fixed planes.
3007 */
3008 if (brw->has_negative_rhw_bug) {
3009 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3010 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3011 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3012 vec4_instruction *inst;
3013 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3014 inst->predicate = BRW_PREDICATE_NORMAL;
3015 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3016 inst->predicate = BRW_PREDICATE_NORMAL;
3017 }
3018
3019 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3020 } else if (brw->gen < 6) {
3021 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3022 } else {
3023 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3024 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3025 dst_reg reg_w = reg;
3026 reg_w.writemask = WRITEMASK_W;
3027 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3028 }
3029 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3030 dst_reg reg_y = reg;
3031 reg_y.writemask = WRITEMASK_Y;
3032 reg_y.type = BRW_REGISTER_TYPE_D;
3033 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3034 }
3035 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3036 dst_reg reg_z = reg;
3037 reg_z.writemask = WRITEMASK_Z;
3038 reg_z.type = BRW_REGISTER_TYPE_D;
3039 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3040 }
3041 }
3042 }
3043
3044 void
3045 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3046 {
3047 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3048 *
3049 * "If a linked set of shaders forming the vertex stage contains no
3050 * static write to gl_ClipVertex or gl_ClipDistance, but the
3051 * application has requested clipping against user clip planes through
3052 * the API, then the coordinate written to gl_Position is used for
3053 * comparison against the user clip planes."
3054 *
3055 * This function is only called if the shader didn't write to
3056 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3057 * if the user wrote to it; otherwise we use gl_Position.
3058 */
3059 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3060 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3061 clip_vertex = VARYING_SLOT_POS;
3062 }
3063
3064 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3065 ++i) {
3066 reg.writemask = 1 << i;
3067 emit(DP4(reg,
3068 src_reg(output_reg[clip_vertex]),
3069 src_reg(this->userplane[i + offset])));
3070 }
3071 }
3072
3073 vec4_instruction *
3074 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3075 {
3076 assert (varying < VARYING_SLOT_MAX);
3077 reg.type = output_reg[varying].type;
3078 current_annotation = output_reg_annotation[varying];
3079 /* Copy the register, saturating if necessary */
3080 return emit(MOV(reg, src_reg(output_reg[varying])));
3081 }
3082
3083 void
3084 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3085 {
3086 reg.type = BRW_REGISTER_TYPE_F;
3087
3088 switch (varying) {
3089 case VARYING_SLOT_PSIZ:
3090 {
3091 /* PSIZ is always in slot 0, and is coupled with other flags. */
3092 current_annotation = "indices, point width, clip flags";
3093 emit_psiz_and_flags(reg);
3094 break;
3095 }
3096 case BRW_VARYING_SLOT_NDC:
3097 current_annotation = "NDC";
3098 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3099 break;
3100 case VARYING_SLOT_POS:
3101 current_annotation = "gl_Position";
3102 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3103 break;
3104 case VARYING_SLOT_EDGE:
3105 /* This is present when doing unfilled polygons. We're supposed to copy
3106 * the edge flag from the user-provided vertex array
3107 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3108 * of that attribute (starts as 1.0f). This is then used in clipping to
3109 * determine which edges should be drawn as wireframe.
3110 */
3111 current_annotation = "edge flag";
3112 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3113 glsl_type::float_type, WRITEMASK_XYZW))));
3114 break;
3115 case BRW_VARYING_SLOT_PAD:
3116 /* No need to write to this slot */
3117 break;
3118 case VARYING_SLOT_COL0:
3119 case VARYING_SLOT_COL1:
3120 case VARYING_SLOT_BFC0:
3121 case VARYING_SLOT_BFC1: {
3122 /* These built-in varyings are only supported in compatibility mode,
3123 * and we only support GS in core profile. So, this must be a vertex
3124 * shader.
3125 */
3126 assert(stage == MESA_SHADER_VERTEX);
3127 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3128 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3129 inst->saturate = true;
3130 break;
3131 }
3132
3133 default:
3134 emit_generic_urb_slot(reg, varying);
3135 break;
3136 }
3137 }
3138
3139 static int
3140 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3141 {
3142 if (brw->gen >= 6) {
3143 /* URB data written (does not include the message header reg) must
3144 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3145 * section 5.4.3.2.2: URB_INTERLEAVED.
3146 *
3147 * URB entries are allocated on a multiple of 1024 bits, so an
3148 * extra 128 bits written here to make the end align to 256 is
3149 * no problem.
3150 */
3151 if ((mlen % 2) != 1)
3152 mlen++;
3153 }
3154
3155 return mlen;
3156 }
3157
3158
3159 /**
3160 * Generates the VUE payload plus the necessary URB write instructions to
3161 * output it.
3162 *
3163 * The VUE layout is documented in Volume 2a.
3164 */
3165 void
3166 vec4_visitor::emit_vertex()
3167 {
3168 /* MRF 0 is reserved for the debugger, so start with message header
3169 * in MRF 1.
3170 */
3171 int base_mrf = 1;
3172 int mrf = base_mrf;
3173 /* In the process of generating our URB write message contents, we
3174 * may need to unspill a register or load from an array. Those
3175 * reads would use MRFs 14-15.
3176 */
3177 int max_usable_mrf = 13;
3178
3179 /* The following assertion verifies that max_usable_mrf causes an
3180 * even-numbered amount of URB write data, which will meet gen6's
3181 * requirements for length alignment.
3182 */
3183 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3184
3185 /* First mrf is the g0-based message header containing URB handles and
3186 * such.
3187 */
3188 emit_urb_write_header(mrf++);
3189
3190 if (brw->gen < 6) {
3191 emit_ndc_computation();
3192 }
3193
3194 /* Lower legacy ff and ClipVertex clipping to clip distances */
3195 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3196 current_annotation = "user clip distances";
3197
3198 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3199 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3200
3201 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3202 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3203 }
3204
3205 /* We may need to split this up into several URB writes, so do them in a
3206 * loop.
3207 */
3208 int slot = 0;
3209 bool complete = false;
3210 do {
3211 /* URB offset is in URB row increments, and each of our MRFs is half of
3212 * one of those, since we're doing interleaved writes.
3213 */
3214 int offset = slot / 2;
3215
3216 mrf = base_mrf + 1;
3217 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3218 emit_urb_slot(dst_reg(MRF, mrf++),
3219 prog_data->vue_map.slot_to_varying[slot]);
3220
3221 /* If this was max_usable_mrf, we can't fit anything more into this
3222 * URB WRITE.
3223 */
3224 if (mrf > max_usable_mrf) {
3225 slot++;
3226 break;
3227 }
3228 }
3229
3230 complete = slot >= prog_data->vue_map.num_slots;
3231 current_annotation = "URB write";
3232 vec4_instruction *inst = emit_urb_write_opcode(complete);
3233 inst->base_mrf = base_mrf;
3234 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3235 inst->offset += offset;
3236 } while(!complete);
3237 }
3238
3239
3240 src_reg
3241 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3242 src_reg *reladdr, int reg_offset)
3243 {
3244 /* Because we store the values to scratch interleaved like our
3245 * vertex data, we need to scale the vec4 index by 2.
3246 */
3247 int message_header_scale = 2;
3248
3249 /* Pre-gen6, the message header uses byte offsets instead of vec4
3250 * (16-byte) offset units.
3251 */
3252 if (brw->gen < 6)
3253 message_header_scale *= 16;
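/* For instance, a reg_offset of 3 becomes an offset of 6 on gen6+, or a
 * byte offset of 96 (3 * 2 * 16) on older generations.
 */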
3254
3255 if (reladdr) {
3256 src_reg index = src_reg(this, glsl_type::int_type);
3257
3258 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3259 src_reg(reg_offset)));
3260 emit_before(block, inst, MUL(dst_reg(index), index,
3261 src_reg(message_header_scale)));
3262
3263 return index;
3264 } else {
3265 return src_reg(reg_offset * message_header_scale);
3266 }
3267 }
3268
3269 src_reg
3270 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3271 src_reg *reladdr, int reg_offset)
3272 {
3273 if (reladdr) {
3274 src_reg index = src_reg(this, glsl_type::int_type);
3275
3276 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3277 src_reg(reg_offset)));
3278
3279 /* Pre-gen6, the message header uses byte offsets instead of vec4
3280 * (16-byte) offset units.
3281 */
3282 if (brw->gen < 6) {
3283 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3284 }
3285
3286 return index;
3287 } else if (brw->gen >= 8) {
3288 /* Store the offset in a GRF so we can send-from-GRF. */
3289 src_reg offset = src_reg(this, glsl_type::int_type);
3290 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3291 return offset;
3292 } else {
3293 int message_header_scale = brw->gen < 6 ? 16 : 1;
3294 return src_reg(reg_offset * message_header_scale);
3295 }
3296 }
3297
3298 /**
3299 * Emits an instruction before @inst to load the value named by @orig_src
3300 * from scratch space at @base_offset to @temp.
3301 *
3302 * @base_offset is measured in 32-byte units (the size of a register).
3303 */
3304 void
3305 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3306 dst_reg temp, src_reg orig_src,
3307 int base_offset)
3308 {
3309 int reg_offset = base_offset + orig_src.reg_offset;
3310 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3311 reg_offset);
3312
3313 emit_before(block, inst, SCRATCH_READ(temp, index));
3314 }
3315
3316 /**
3317 * Emits an instruction after @inst to store the value to be written
3318 * to @orig_dst to scratch space at @base_offset, from @temp.
3319 *
3320 * @base_offset is measured in 32-byte units (the size of a register).
3321 */
3322 void
3323 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3324 int base_offset)
3325 {
3326 int reg_offset = base_offset + inst->dst.reg_offset;
3327 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3328 reg_offset);
3329
3330 /* Create a temporary register to store *inst's result in.
3331 *
3332 * We have to be careful in MOVing from our temporary result register in
3333 * the scratch write. If we swizzle from channels of the temporary that
3334 * weren't initialized, it will confuse live interval analysis, which will
3335 * make spilling fail to make progress.
3336 */
3337 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3338 inst->dst.type),
3339 brw_swizzle_for_mask(inst->dst.writemask));
3340 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3341 inst->dst.writemask));
3342 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3343 write->predicate = inst->predicate;
3344 write->ir = inst->ir;
3345 write->annotation = inst->annotation;
3346 inst->insert_after(block, write);
3347
3348 inst->dst.file = temp.file;
3349 inst->dst.reg = temp.reg;
3350 inst->dst.reg_offset = temp.reg_offset;
3351 inst->dst.reladdr = NULL;
3352 }
3353
3354 /**
3355 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3356 * adds the scratch read(s) before \p inst. The function also checks for
3357 * recursive reladdr scratch accesses, issuing the corresponding scratch
3358 * loads and rewriting reladdr references accordingly.
3359 *
3360 * \return \p src if it did not require a scratch load, otherwise, the
3361 * register holding the result of the scratch load that the caller should
3362 * use to rewrite src.
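*
* A nested case arises, for example, when the variable index itself comes
* from an array that has been moved to scratch (roughly "a[b[i]]" in GLSL
* terms): \p src.reladdr then names a GRF with a scratch location, and it
* must be loaded back before \p src itself can be resolved.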
3363 */
3364 src_reg
3365 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3366 vec4_instruction *inst, src_reg src)
3367 {
3368 /* Resolve recursive reladdr scratch access by calling ourselves
3369 * with src.reladdr
3370 */
3371 if (src.reladdr)
3372 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3373 *src.reladdr);
3374
3375 /* Now handle scratch access on src */
3376 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3377 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3378 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3379 src.reg = temp.reg;
3380 src.reg_offset = temp.reg_offset;
3381 src.reladdr = NULL;
3382 }
3383
3384 return src;
3385 }
3386
3387 /**
3388 * We can't generally support array access in GRF space, because a
3389 * single instruction's destination can only span 2 contiguous
3390 * registers. So, we send all GRF arrays that get variable index
3391 * access to scratch space.
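*
* For example, a local array such as "vec4 tmp[8]" indexed with a loop
* counter would need an 8-register destination span, so its accesses are
* rewritten as scratch reads and writes instead.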
3392 */
3393 void
3394 vec4_visitor::move_grf_array_access_to_scratch()
3395 {
3396 int scratch_loc[this->alloc.count];
3397 memset(scratch_loc, -1, sizeof(scratch_loc));
3398
3399 /* First, calculate the set of virtual GRFs that need to be punted
3400 * to scratch due to having any array access on them, and where in
3401 * scratch.
3402 */
3403 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3404 if (inst->dst.file == GRF && inst->dst.reladdr) {
3405 if (scratch_loc[inst->dst.reg] == -1) {
3406 scratch_loc[inst->dst.reg] = c->last_scratch;
3407 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3408 }
3409
3410 for (src_reg *iter = inst->dst.reladdr;
3411 iter->reladdr;
3412 iter = iter->reladdr) {
3413 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3414 scratch_loc[iter->reg] = c->last_scratch;
3415 c->last_scratch += this->alloc.sizes[iter->reg];
3416 }
3417 }
3418 }
3419
3420 for (int i = 0 ; i < 3; i++) {
3421 for (src_reg *iter = &inst->src[i];
3422 iter->reladdr;
3423 iter = iter->reladdr) {
3424 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3425 scratch_loc[iter->reg] = c->last_scratch;
3426 c->last_scratch += this->alloc.sizes[iter->reg];
3427 }
3428 }
3429 }
3430 }
3431
3432 /* Now, for anything that will be accessed through scratch, rewrite
3433 * it to load/store. Note that this is a _safe list walk, because
3434 * we may generate a new scratch_write instruction after the one
3435 * we're processing.
3436 */
3437 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3438 /* Set up the annotation tracking for new generated instructions. */
3439 base_ir = inst->ir;
3440 current_annotation = inst->annotation;
3441
3442 /* First handle scratch access on the dst. Notice we have to handle
3443 * the case where the dst's reladdr also points to scratch space.
3444 */
3445 if (inst->dst.reladdr)
3446 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3447 *inst->dst.reladdr);
3448
3449 /* Now that we have handled any (possibly recursive) reladdr scratch
3450 * accesses for dst we can safely do the scratch write for dst itself
3451 */
3452 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3453 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3454
3455 /* Now handle scratch access on any src. In this case, since inst->src[i]
3456 * already is a src_reg, we can just call emit_resolve_reladdr with
3457 * inst->src[i] and it will take care of handling scratch loads for
3458 * both src and src.reladdr (recursively).
3459 */
3460 for (int i = 0 ; i < 3; i++) {
3461 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3462 inst->src[i]);
3463 }
3464 }
3465 }
3466
3467 /**
3468 * Emits an instruction before @inst to load the value named by @orig_src
3469 * from the pull constant buffer (surface) at @base_offset to @temp.
3470 */
3471 void
3472 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3473 dst_reg temp, src_reg orig_src,
3474 int base_offset)
3475 {
3476 int reg_offset = base_offset + orig_src.reg_offset;
3477 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3478 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3479 reg_offset);
3480 vec4_instruction *load;
3481
3482 if (brw->gen >= 7) {
3483 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3484
3485 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3486 * Reserve space for the register.
3487 */
3488 if (brw->gen >= 9) {
3489 grf_offset.reg_offset++;
3490 alloc.sizes[grf_offset.reg] = 2;
3491 }
3492
3493 grf_offset.type = offset.type;
3494 emit_before(block, inst, MOV(grf_offset, offset));
3495
3496 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3497 temp, index, src_reg(grf_offset));
3498 load->mlen = 1;
3499 } else {
3500 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
3501 temp, index, offset);
3502 load->base_mrf = 14;
3503 load->mlen = 1;
3504 }
3505 emit_before(block, inst, load);
3506 }
3507
3508 /**
3509 * Implements array access of uniforms by inserting a
3510 * PULL_CONSTANT_LOAD instruction.
3511 *
3512 * Unlike temporary GRF array access (where we don't support it due to
3513 * the difficulty of doing relative addressing on instruction
3514 * destinations), we could potentially do array access of uniforms
3515 * that were loaded in GRF space as push constants. In real-world
3516 * usage we've seen, though, the arrays being used are always larger
3517 * than we could load as push constants, so just always move all
3518 * uniform array access out to a pull constant buffer.
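*
* A typical example would be a skinning shader that indexes a large
* uniform matrix array with a per-vertex bone index.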
3519 */
3520 void
3521 vec4_visitor::move_uniform_array_access_to_pull_constants()
3522 {
3523 int pull_constant_loc[this->uniforms];
3524 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3525 bool nested_reladdr;
3526
3527 /* Walk through and find array access of uniforms. Put a copy of that
3528 * uniform in the pull constant buffer.
3529 *
3530 * Note that we don't move constant-indexed accesses to arrays. No
3531 * testing has been done of the performance impact of this choice.
3532 */
3533 do {
3534 nested_reladdr = false;
3535
3536 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3537 for (int i = 0 ; i < 3; i++) {
3538 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3539 continue;
3540
3541 int uniform = inst->src[i].reg;
3542
3543 if (inst->src[i].reladdr->reladdr)
3544 nested_reladdr = true; /* will need another pass */
3545
3546 /* If this array isn't already present in the pull constant buffer,
3547 * add it.
3548 */
3549 if (pull_constant_loc[uniform] == -1) {
3550 const gl_constant_value **values =
3551 &stage_prog_data->param[uniform * 4];
3552
3553 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3554
3555 assert(uniform < uniform_array_size);
3556 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3557 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3558 = values[j];
3559 }
3560 }
3561
3562 /* Set up the annotation tracking for new generated instructions. */
3563 base_ir = inst->ir;
3564 current_annotation = inst->annotation;
3565
3566 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3567
3568 emit_pull_constant_load(block, inst, temp, inst->src[i],
3569 pull_constant_loc[uniform]);
3570
3571 inst->src[i].file = temp.file;
3572 inst->src[i].reg = temp.reg;
3573 inst->src[i].reg_offset = temp.reg_offset;
3574 inst->src[i].reladdr = NULL;
3575 }
3576 }
3577 } while (nested_reladdr);
3578
3579 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3580 * no need to track them as larger-than-vec4 objects. This will be
3581 * relied on in cutting out unused uniform vectors from push
3582 * constants.
3583 */
3584 split_uniform_registers();
3585 }
3586
3587 void
3588 vec4_visitor::resolve_ud_negate(src_reg *reg)
3589 {
3590 if (reg->type != BRW_REGISTER_TYPE_UD ||
3591 !reg->negate)
3592 return;
3593
3594 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3595 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3596 *reg = temp;
3597 }
3598
3599 /**
3600 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3601 *
3602 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3603 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
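*
* The fixup below ANDs the value with 1 and negates the result, so an LSB
* of 1 becomes the integer -1 (all bits set) and an LSB of 0 stays 0.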
3604 */
3605 void
3606 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3607 {
3608 assert(brw->gen <= 5);
3609
3610 if (!rvalue->type->is_boolean())
3611 return;
3612
3613 src_reg and_result = src_reg(this, rvalue->type);
3614 src_reg neg_result = src_reg(this, rvalue->type);
3615 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3616 emit(MOV(dst_reg(neg_result), negate(and_result)));
3617 *reg = neg_result;
3618 }
3619
3620 vec4_visitor::vec4_visitor(struct brw_context *brw,
3621 struct brw_vec4_compile *c,
3622 struct gl_program *prog,
3623 const struct brw_vue_prog_key *key,
3624 struct brw_vue_prog_data *prog_data,
3625 struct gl_shader_program *shader_prog,
3626 gl_shader_stage stage,
3627 void *mem_ctx,
3628 bool no_spills,
3629 shader_time_shader_type st_base,
3630 shader_time_shader_type st_written,
3631 shader_time_shader_type st_reset)
3632 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3633 c(c),
3634 key(key),
3635 prog_data(prog_data),
3636 sanity_param_count(0),
3637 fail_msg(NULL),
3638 first_non_payload_grf(0),
3639 need_all_constants_in_pull_buffer(false),
3640 no_spills(no_spills),
3641 st_base(st_base),
3642 st_written(st_written),
3643 st_reset(st_reset)
3644 {
3645 this->mem_ctx = mem_ctx;
3646 this->failed = false;
3647
3648 this->base_ir = NULL;
3649 this->current_annotation = NULL;
3650 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3651
3652 this->variable_ht = hash_table_ctor(0,
3653 hash_table_pointer_hash,
3654 hash_table_pointer_compare);
3655
3656 this->virtual_grf_start = NULL;
3657 this->virtual_grf_end = NULL;
3658 this->live_intervals = NULL;
3659
3660 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3661
3662 this->uniforms = 0;
3663
3664 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3665 * at least one. See setup_uniforms() in brw_vec4.cpp.
3666 */
3667 this->uniform_array_size = 1;
3668 if (prog_data) {
3669 this->uniform_array_size =
3670 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3671 }
3672
3673 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3674 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3675 }
3676
3677 vec4_visitor::~vec4_visitor()
3678 {
3679 hash_table_dtor(this->variable_ht);
3680 }
3681
3682
3683 void
3684 vec4_visitor::fail(const char *format, ...)
3685 {
3686 va_list va;
3687 char *msg;
3688
3689 if (failed)
3690 return;
3691
3692 failed = true;
3693
3694 va_start(va, format);
3695 msg = ralloc_vasprintf(mem_ctx, format, va);
3696 va_end(va);
3697 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3698
3699 this->fail_msg = msg;
3700
3701 if (debug_enabled) {
3702 fprintf(stderr, "%s", msg);
3703 }
3704 }
3705
3706 } /* namespace brw */