i965/vec4: Emit MADs from (x + -(y * z)).
src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
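/* Convenience constructors for ALU instructions: these macros build a
 * vec4_instruction with the given opcode and operands but do not add it to
 * the instruction stream; callers pass the result to emit().  ALU2_ACC
 * additionally marks the instruction as writing the accumulator.
 */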
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(brw->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
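/* Build (but do not emit) Gen4-style scratch-space access messages.  The
 * base MRF and message length of the payload are fixed here; the index
 * source supplies the scratch offset.
 */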
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = 14;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = 13;
274 inst->mlen = 3;
275
276 return inst;
277 }
278
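/* Emit a dot product of the requested width: 2, 3, or 4 components map to
 * DP2, DP3, and DP4 respectively.
 */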
279 void
280 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
281 {
282 static enum opcode dot_opcodes[] = {
283 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
284 };
285
286 emit(dot_opcodes[elements - 2], dst, src0, src1);
287 }
288
289 src_reg
290 vec4_visitor::fix_3src_operand(src_reg src)
291 {
292 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
293 * able to use vertical stride of zero to replicate the vec4 uniform, like
294 *
295 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
296 *
297 * But you can't, since vertical stride is always four in three-source
298 * instructions. Instead, insert a MOV instruction to do the replication so
299 * that the three-source instruction can consume it.
300 */
301
302 /* The MOV is only needed if the source is a uniform or immediate. */
303 if (src.file != UNIFORM && src.file != IMM)
304 return src;
305
306 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
307 return src;
308
309 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
310 expanded.type = src.type;
311 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
312 return src_reg(expanded);
313 }
314
315 src_reg
316 vec4_visitor::fix_math_operand(src_reg src)
317 {
318 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
319 return src;
320
321 /* The gen6 math instruction ignores the source modifiers --
322 * swizzle, abs, negate, and at least some parts of the register
323 * region description.
324 *
325 * Rather than trying to enumerate all these cases, *always* expand the
326 * operand to a temp GRF for gen6.
327 *
328 * For gen7, keep the operand as-is, except if immediate, which gen7 still
329 * can't use.
330 */
331
332 if (brw->gen == 7 && src.file != IMM)
333 return src;
334
335 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
336 expanded.type = src.type;
337 emit(MOV(expanded, src));
338 return src_reg(expanded);
339 }
340
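/* Emit a math instruction, working around per-generation restrictions:
 * sources may need to be copied to temporary GRFs (see fix_math_operand),
 * Gen6 math cannot use a destination writemask, and pre-Gen6 math is a send
 * that needs an MRF payload.
 */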
341 void
342 vec4_visitor::emit_math(enum opcode opcode,
343 const dst_reg &dst,
344 const src_reg &src0, const src_reg &src1)
345 {
346 vec4_instruction *math =
347 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
348
349 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
350 /* MATH on Gen6 must be align1, so we can't do writemasks. */
351 math->dst = dst_reg(this, glsl_type::vec4_type);
352 math->dst.type = dst.type;
353 emit(MOV(dst, src_reg(math->dst)));
354 } else if (brw->gen < 6) {
355 math->base_mrf = 1;
356 math->mlen = src1.file == BAD_FILE ? 1 : 2;
357 }
358 }
359
360 void
361 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
362 {
363 if (brw->gen < 7) {
364 unreachable("ir_unop_pack_half_2x16 should be lowered");
365 }
366
367 assert(dst.type == BRW_REGISTER_TYPE_UD);
368 assert(src0.type == BRW_REGISTER_TYPE_F);
369
370 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
371 *
372 * Because this instruction does not have a 16-bit floating-point type,
373 * the destination data type must be Word (W).
374 *
375 * The destination must be DWord-aligned and specify a horizontal stride
376 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
377 * each destination channel and the upper word is not modified.
378 *
379 * The above restriction implies that the f32to16 instruction must use
380 * align1 mode, because only in align1 mode is it possible to specify
381 * horizontal stride. We choose here to defy the hardware docs and emit
382 * align16 instructions.
383 *
384 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
385 * instructions. I was partially successful in that the code passed all
386 * tests. However, the code was dubiously correct and fragile, and the
387 * tests were not harsh enough to probe that frailty. Not trusting the
388 * code, I chose instead to remain in align16 mode in defiance of the hw
389 * docs).
390 *
391 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
392 * simulator, emitting a f32to16 in align16 mode with UD as destination
393 * data type is safe. The behavior differs from that specified in the PRM
394 * in that the upper word of each destination channel is cleared to 0.
395 */
396
397 dst_reg tmp_dst(this, glsl_type::uvec2_type);
398 src_reg tmp_src(tmp_dst);
399
400 #if 0
401 /* Verify the undocumented behavior on which the following instructions
402 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
403 * then the result of the bit-or instruction below will be incorrect.
404 *
405 * You should inspect the disasm output in order to verify that the MOV is
406 * not optimized away.
407 */
408 emit(MOV(tmp_dst, src_reg(0x12345678u)));
409 #endif
410
411 /* Give tmp the form below, where "." means untouched.
412 *
413 * w z y x w z y x
414 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
415 *
416 * That the upper word of each write-channel be 0 is required for the
417 * following bit-shift and bit-or instructions to work. Note that this
418 * relies on the undocumented hardware behavior mentioned above.
419 */
420 tmp_dst.writemask = WRITEMASK_XY;
421 emit(F32TO16(tmp_dst, src0));
422
423 /* Give the write-channels of dst the form:
424 * 0xhhhh0000
425 */
426 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
427 emit(SHL(dst, tmp_src, src_reg(16u)));
428
429 /* Finally, give the write-channels of dst the form of packHalf2x16's
430 * output:
431 * 0xhhhhllll
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
434 emit(OR(dst, src_reg(dst), tmp_src));
435 }
436
437 void
438 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_unpack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_F);
445 assert(src0.type == BRW_REGISTER_TYPE_UD);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the source data type must be Word (W). The destination type must be
451 * F (Float).
452 *
453 * To use W as the source data type, we must adjust horizontal strides,
454 * which is only possible in align1 mode. All my [chadv] attempts at
455 * emitting align1 instructions for unpackHalf2x16 failed to pass the
456 * Piglit tests, so I gave up.
457 *
458 * I've verified that, on gen7 hardware and the simulator, it is safe to
459 * emit f16to32 in align16 mode with UD as source data type.
460 */
461
462 dst_reg tmp_dst(this, glsl_type::uvec2_type);
463 src_reg tmp_src(tmp_dst);
464
465 tmp_dst.writemask = WRITEMASK_X;
466 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
467
468 tmp_dst.writemask = WRITEMASK_Y;
469 emit(SHR(tmp_dst, src0, src_reg(16u)));
470
471 dst.writemask = WRITEMASK_XY;
472 emit(F16TO32(dst, tmp_src));
473 }
474
475 void
476 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
477 {
478 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
479 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
480 * is not suitable to generate the shift values, but we can use the packed
481 * vector float and a type-converting MOV.
482 */
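/* 0x00, 0x60, 0x70, and 0x78 are the 8-bit vector-float encodings of 0.0,
 * 8.0, 16.0, and 24.0; the type-converting MOV into a uvec4 yields the
 * integer shift counts <0, 8, 16, 24>.
 */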
483 dst_reg shift(this, glsl_type::uvec4_type);
484 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
485
486 dst_reg shifted(this, glsl_type::uvec4_type);
487 src0.swizzle = BRW_SWIZZLE_XXXX;
488 emit(SHR(shifted, src0, src_reg(shift)));
489
490 shifted.type = BRW_REGISTER_TYPE_UB;
491 dst_reg f(this, glsl_type::vec4_type);
492 emit(MOV(f, src_reg(shifted)));
493
494 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
495 }
496
497 void
498 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
499 {
500 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
501 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
502 * is not suitable to generate the shift values, but we can use the packed
503 * vector float and a type-converting MOV.
504 */
505 dst_reg shift(this, glsl_type::uvec4_type);
506 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
507
508 dst_reg shifted(this, glsl_type::uvec4_type);
509 src0.swizzle = BRW_SWIZZLE_XXXX;
510 emit(SHR(shifted, src0, src_reg(shift)));
511
512 shifted.type = BRW_REGISTER_TYPE_B;
513 dst_reg f(this, glsl_type::vec4_type);
514 emit(MOV(f, src_reg(shifted)));
515
516 dst_reg scaled(this, glsl_type::vec4_type);
517 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
518
519 dst_reg max(this, glsl_type::vec4_type);
520 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
521 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
522 }
523
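/* packUnorm4x8: clamp each component to [0, 1], scale by 255, round to the
 * nearest even integer, convert to unsigned int, and pack the low byte of
 * each channel into the destination dword.
 */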
524 void
525 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
526 {
527 dst_reg saturated(this, glsl_type::vec4_type);
528 vec4_instruction *inst = emit(MOV(saturated, src0));
529 inst->saturate = true;
530
531 dst_reg scaled(this, glsl_type::vec4_type);
532 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
533
534 dst_reg rounded(this, glsl_type::vec4_type);
535 emit(RNDE(rounded, src_reg(scaled)));
536
537 dst_reg u(this, glsl_type::uvec4_type);
538 emit(MOV(u, src_reg(rounded)));
539
540 src_reg bytes(u);
541 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
542 }
543
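/* packSnorm4x8: clamp each component to [-1, 1], scale by 127, round to the
 * nearest even integer, convert to signed int, and pack the low byte of each
 * channel into the destination dword.
 */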
544 void
545 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
546 {
547 dst_reg max(this, glsl_type::vec4_type);
548 emit_minmax(BRW_CONDITIONAL_G, max, src0, src_reg(-1.0f));
549
550 dst_reg min(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
552
553 dst_reg scaled(this, glsl_type::vec4_type);
554 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
555
556 dst_reg rounded(this, glsl_type::vec4_type);
557 emit(RNDE(rounded, src_reg(scaled)));
558
559 dst_reg i(this, glsl_type::ivec4_type);
560 emit(MOV(i, src_reg(rounded)));
561
562 src_reg bytes(i);
563 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
564 }
565
566 void
567 vec4_visitor::visit_instructions(const exec_list *list)
568 {
569 foreach_in_list(ir_instruction, ir, list) {
570 base_ir = ir;
571 ir->accept(this);
572 }
573 }
574
575
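/* Return the number of vec4 registers needed to hold a value of the given
 * GLSL type.  Every scalar or vector occupies a full vec4 slot, matrices take
 * one slot per column, and samplers and atomics take no space at all.
 */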
576 static int
577 type_size(const struct glsl_type *type)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 if (type->is_matrix()) {
588 return type->matrix_columns;
589 } else {
590 /* Regardless of size of vector, it gets a vec4. This is bad
591 * packing for things like floats, but otherwise arrays become a
592 * mess. Hopefully a later pass over the code can pack scalars
593 * down if appropriate.
594 */
595 return 1;
596 }
597 case GLSL_TYPE_ARRAY:
598 assert(type->length > 0);
599 return type_size(type->fields.array) * type->length;
600 case GLSL_TYPE_STRUCT:
601 size = 0;
602 for (i = 0; i < type->length; i++) {
603 size += type_size(type->fields.structure[i].type);
604 }
605 return size;
606 case GLSL_TYPE_SAMPLER:
607 /* Samplers take up no register space, since they're baked in at
608 * link time.
609 */
610 return 0;
611 case GLSL_TYPE_ATOMIC_UINT:
612 return 0;
613 case GLSL_TYPE_IMAGE:
614 case GLSL_TYPE_VOID:
615 case GLSL_TYPE_ERROR:
616 case GLSL_TYPE_INTERFACE:
617 unreachable("not reached");
618 }
619
620 return 0;
621 }
622
623 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
624 {
625 init();
626
627 this->file = GRF;
628 this->reg = v->alloc.allocate(type_size(type));
629
630 if (type->is_array() || type->is_record()) {
631 this->swizzle = BRW_SWIZZLE_NOOP;
632 } else {
633 this->swizzle = swizzle_for_size(type->vector_elements);
634 }
635
636 this->type = brw_type_for_base_type(type);
637 }
638
639 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
640 {
641 assert(size > 0);
642
643 init();
644
645 this->file = GRF;
646 this->reg = v->alloc.allocate(type_size(type) * size);
647
648 this->swizzle = BRW_SWIZZLE_NOOP;
649
650 this->type = brw_type_for_base_type(type);
651 }
652
653 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
654 {
655 init();
656
657 this->file = GRF;
658 this->reg = v->alloc.allocate(type_size(type));
659
660 if (type->is_array() || type->is_record()) {
661 this->writemask = WRITEMASK_XYZW;
662 } else {
663 this->writemask = (1 << type->vector_elements) - 1;
664 }
665
666 this->type = brw_type_for_base_type(type);
667 }
668
669 /* Our support for uniforms is piggy-backed on the struct
670 * gl_fragment_program, because that's where the values actually
671 * get stored, rather than in some global gl_shader_program uniform
672 * store.
673 */
674 void
675 vec4_visitor::setup_uniform_values(ir_variable *ir)
676 {
677 int namelen = strlen(ir->name);
678
679 /* The data for our (non-builtin) uniforms is stored in a series of
680 * gl_uniform_driver_storage structs for each subcomponent that
681 * glGetUniformLocation() could name. We know it's been set up in the same
682 * order we'd walk the type, so walk the list of storage and find anything
683 * with our name, or the prefix of a component that starts with our name.
684 */
685 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
686 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
687
688 if (strncmp(ir->name, storage->name, namelen) != 0 ||
689 (storage->name[namelen] != 0 &&
690 storage->name[namelen] != '.' &&
691 storage->name[namelen] != '[')) {
692 continue;
693 }
694
695 gl_constant_value *components = storage->storage;
696 unsigned vector_count = (MAX2(storage->array_elements, 1) *
697 storage->type->matrix_columns);
698
699 for (unsigned s = 0; s < vector_count; s++) {
700 assert(uniforms < uniform_array_size);
701 uniform_vector_size[uniforms] = storage->type->vector_elements;
702
703 int i;
704 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
705 stage_prog_data->param[uniforms * 4 + i] = components;
706 components++;
707 }
708 for (; i < 4; i++) {
709 static gl_constant_value zero = { 0.0 };
710 stage_prog_data->param[uniforms * 4 + i] = &zero;
711 }
712
713 uniforms++;
714 }
715 }
716 }
717
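/* Set up one vec4 uniform per enabled user clip plane, pointing the
 * corresponding param[] entries at the GL clip-plane state.
 */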
718 void
719 vec4_visitor::setup_uniform_clipplane_values()
720 {
721 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
722
723 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
724 assert(this->uniforms < uniform_array_size);
725 this->uniform_vector_size[this->uniforms] = 4;
726 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
727 this->userplane[i].type = BRW_REGISTER_TYPE_F;
728 for (int j = 0; j < 4; ++j) {
729 stage_prog_data->param[this->uniforms * 4 + j] =
730 (gl_constant_value *) &clip_planes[i][j];
731 }
732 ++this->uniforms;
733 }
734 }
735
736 /* Our support for builtin uniforms is even scarier than non-builtin.
737 * It sits on top of the PROG_STATE_VAR parameters that are
738 * automatically updated from GL context state.
739 */
740 void
741 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
742 {
743 const ir_state_slot *const slots = ir->get_state_slots();
744 assert(slots != NULL);
745
746 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
747 /* This state reference has already been set up by ir_to_mesa,
748 * but we'll get the same index back here. We can reference
749 * ParameterValues directly, since unlike brw_fs.cpp, we never
750 * add new state references during compile.
751 */
752 int index = _mesa_add_state_reference(this->prog->Parameters,
753 (gl_state_index *)slots[i].tokens);
754 gl_constant_value *values =
755 &this->prog->Parameters->ParameterValues[index][0];
756
757 assert(this->uniforms < uniform_array_size);
758 this->uniform_vector_size[this->uniforms] = 0;
759 /* Add each of the unique swizzled channels of the element.
760 * This will end up matching the size of the glsl_type of this field.
761 */
762 int last_swiz = -1;
763 for (unsigned int j = 0; j < 4; j++) {
764 int swiz = GET_SWZ(slots[i].swizzle, j);
765 last_swiz = swiz;
766
767 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
768 assert(this->uniforms < uniform_array_size);
769 if (swiz <= last_swiz)
770 this->uniform_vector_size[this->uniforms]++;
771 }
772 this->uniforms++;
773 }
774 }
775
776 dst_reg *
777 vec4_visitor::variable_storage(ir_variable *var)
778 {
779 return (dst_reg *)hash_table_find(this->variable_ht, var);
780 }
781
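/* Evaluate a boolean rvalue and load the result into the flag register,
 * returning in *predicate the predicate the caller should use (normal, or
 * ALL4H/ANY4H for vector comparisons).  Comparisons are folded directly into
 * a CMP; other expressions are reduced to a zero/non-zero test.
 */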
782 void
783 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
784 enum brw_predicate *predicate)
785 {
786 ir_expression *expr = ir->as_expression();
787
788 *predicate = BRW_PREDICATE_NORMAL;
789
790 if (expr && expr->operation != ir_binop_ubo_load) {
791 src_reg op[3];
792 vec4_instruction *inst;
793
794 assert(expr->get_num_operands() <= 3);
795 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
796 expr->operands[i]->accept(this);
797 op[i] = this->result;
798
799 resolve_ud_negate(&op[i]);
800 }
801
802 switch (expr->operation) {
803 case ir_unop_logic_not:
804 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
805 inst->conditional_mod = BRW_CONDITIONAL_Z;
806 break;
807
808 case ir_binop_logic_xor:
809 if (brw->gen <= 5) {
810 src_reg temp = src_reg(this, ir->type);
811 emit(XOR(dst_reg(temp), op[0], op[1]));
812 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
813 } else {
814 inst = emit(XOR(dst_null_d(), op[0], op[1]));
815 }
816 inst->conditional_mod = BRW_CONDITIONAL_NZ;
817 break;
818
819 case ir_binop_logic_or:
820 if (brw->gen <= 5) {
821 src_reg temp = src_reg(this, ir->type);
822 emit(OR(dst_reg(temp), op[0], op[1]));
823 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
824 } else {
825 inst = emit(OR(dst_null_d(), op[0], op[1]));
826 }
827 inst->conditional_mod = BRW_CONDITIONAL_NZ;
828 break;
829
830 case ir_binop_logic_and:
831 if (brw->gen <= 5) {
832 src_reg temp = src_reg(this, ir->type);
833 emit(AND(dst_reg(temp), op[0], op[1]));
834 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
835 } else {
836 inst = emit(AND(dst_null_d(), op[0], op[1]));
837 }
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 break;
840
841 case ir_unop_f2b:
842 if (brw->gen >= 6) {
843 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
844 } else {
845 inst = emit(MOV(dst_null_f(), op[0]));
846 inst->conditional_mod = BRW_CONDITIONAL_NZ;
847 }
848 break;
849
850 case ir_unop_i2b:
851 if (brw->gen >= 6) {
852 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
853 } else {
854 inst = emit(MOV(dst_null_d(), op[0]));
855 inst->conditional_mod = BRW_CONDITIONAL_NZ;
856 }
857 break;
858
859 case ir_binop_all_equal:
860 if (brw->gen <= 5) {
861 resolve_bool_comparison(expr->operands[0], &op[0]);
862 resolve_bool_comparison(expr->operands[1], &op[1]);
863 }
864 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
865 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
866 break;
867
868 case ir_binop_any_nequal:
869 if (brw->gen <= 5) {
870 resolve_bool_comparison(expr->operands[0], &op[0]);
871 resolve_bool_comparison(expr->operands[1], &op[1]);
872 }
873 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
874 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
875 break;
876
877 case ir_unop_any:
878 if (brw->gen <= 5) {
879 resolve_bool_comparison(expr->operands[0], &op[0]);
880 }
881 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
882 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
883 break;
884
885 case ir_binop_greater:
886 case ir_binop_gequal:
887 case ir_binop_less:
888 case ir_binop_lequal:
889 case ir_binop_equal:
890 case ir_binop_nequal:
891 if (brw->gen <= 5) {
892 resolve_bool_comparison(expr->operands[0], &op[0]);
893 resolve_bool_comparison(expr->operands[1], &op[1]);
894 }
895 emit(CMP(dst_null_d(), op[0], op[1],
896 brw_conditional_for_comparison(expr->operation)));
897 break;
898
899 case ir_triop_csel: {
900 /* Expand the boolean condition into the flag register. */
901 inst = emit(MOV(dst_null_d(), op[0]));
902 inst->conditional_mod = BRW_CONDITIONAL_NZ;
903
904 /* Select which boolean to return. */
905 dst_reg temp(this, expr->operands[1]->type);
906 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
907 inst->predicate = BRW_PREDICATE_NORMAL;
908
909 /* Expand the result to a condition code. */
910 inst = emit(MOV(dst_null_d(), src_reg(temp)));
911 inst->conditional_mod = BRW_CONDITIONAL_NZ;
912 break;
913 }
914
915 default:
916 unreachable("not reached");
917 }
918 return;
919 }
920
921 ir->accept(this);
922
923 resolve_ud_negate(&this->result);
924
925 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
926 inst->conditional_mod = BRW_CONDITIONAL_NZ;
927 }
928
929 /**
930 * Emit a gen6 IF statement with the comparison folded into the IF
931 * instruction.
932 */
933 void
934 vec4_visitor::emit_if_gen6(ir_if *ir)
935 {
936 ir_expression *expr = ir->condition->as_expression();
937
938 if (expr && expr->operation != ir_binop_ubo_load) {
939 src_reg op[3];
940 dst_reg temp;
941
942 assert(expr->get_num_operands() <= 3);
943 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
944 expr->operands[i]->accept(this);
945 op[i] = this->result;
946 }
947
948 switch (expr->operation) {
949 case ir_unop_logic_not:
950 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
951 return;
952
953 case ir_binop_logic_xor:
954 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
955 return;
956
957 case ir_binop_logic_or:
958 temp = dst_reg(this, glsl_type::bool_type);
959 emit(OR(temp, op[0], op[1]));
960 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
961 return;
962
963 case ir_binop_logic_and:
964 temp = dst_reg(this, glsl_type::bool_type);
965 emit(AND(temp, op[0], op[1]));
966 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
967 return;
968
969 case ir_unop_f2b:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
971 return;
972
973 case ir_unop_i2b:
974 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
975 return;
976
977 case ir_binop_greater:
978 case ir_binop_gequal:
979 case ir_binop_less:
980 case ir_binop_lequal:
981 case ir_binop_equal:
982 case ir_binop_nequal:
983 emit(IF(op[0], op[1],
984 brw_conditional_for_comparison(expr->operation)));
985 return;
986
987 case ir_binop_all_equal:
988 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
989 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
990 return;
991
992 case ir_binop_any_nequal:
993 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
994 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
995 return;
996
997 case ir_unop_any:
998 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
999 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1000 return;
1001
1002 case ir_triop_csel: {
1003 /* Expand the boolean condition into the flag register. */
1004 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1005 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1006
1007 /* Select which boolean to return. */
1008 dst_reg temp(this, expr->operands[1]->type);
1009 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1010 inst->predicate = BRW_PREDICATE_NORMAL;
1011
1012 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1013 return;
1014 }
1015
1016 default:
1017 unreachable("not reached");
1018 }
1019 return;
1020 }
1021
1022 ir->condition->accept(this);
1023
1024 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_variable *ir)
1029 {
1030 dst_reg *reg = NULL;
1031
1032 if (variable_storage(ir))
1033 return;
1034
1035 switch (ir->data.mode) {
1036 case ir_var_shader_in:
1037 assert(ir->data.location != -1);
1038 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1039 break;
1040
1041 case ir_var_shader_out:
1042 assert(ir->data.location != -1);
1043 reg = new(mem_ctx) dst_reg(this, ir->type);
1044
1045 for (int i = 0; i < type_size(ir->type); i++) {
1046 output_reg[ir->data.location + i] = *reg;
1047 output_reg[ir->data.location + i].reg_offset = i;
1048 output_reg[ir->data.location + i].type =
1049 brw_type_for_base_type(ir->type->get_scalar_type());
1050 output_reg_annotation[ir->data.location + i] = ir->name;
1051 }
1052 break;
1053
1054 case ir_var_auto:
1055 case ir_var_temporary:
1056 reg = new(mem_ctx) dst_reg(this, ir->type);
1057 break;
1058
1059 case ir_var_uniform:
1060 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1061
1062 /* Thanks to the lower_ubo_reference pass, we will see only
1063 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1064 * variables, so no need for them to be in variable_ht.
1065 *
1066 * Some uniforms, such as samplers and atomic counters, have no actual
1067 * storage, so we should ignore them.
1068 */
1069 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1070 return;
1071
1072 /* Track how big the whole uniform variable is, in case we need to put a
1073 * copy of its data into pull constants for array access.
1074 */
1075 assert(this->uniforms < uniform_array_size);
1076 this->uniform_size[this->uniforms] = type_size(ir->type);
1077
1078 if (!strncmp(ir->name, "gl_", 3)) {
1079 setup_builtin_uniform_values(ir);
1080 } else {
1081 setup_uniform_values(ir);
1082 }
1083 break;
1084
1085 case ir_var_system_value:
1086 reg = make_reg_for_system_value(ir);
1087 break;
1088
1089 default:
1090 unreachable("not reached");
1091 }
1092
1093 reg->type = brw_type_for_base_type(ir->type);
1094 hash_table_insert(this->variable_ht, reg, ir);
1095 }
1096
1097 void
1098 vec4_visitor::visit(ir_loop *ir)
1099 {
1100 /* We don't want debugging output to print the whole body of the
1101 * loop as the annotation.
1102 */
1103 this->base_ir = NULL;
1104
1105 emit(BRW_OPCODE_DO);
1106
1107 visit_instructions(&ir->body_instructions);
1108
1109 emit(BRW_OPCODE_WHILE);
1110 }
1111
1112 void
1113 vec4_visitor::visit(ir_loop_jump *ir)
1114 {
1115 switch (ir->mode) {
1116 case ir_loop_jump::jump_break:
1117 emit(BRW_OPCODE_BREAK);
1118 break;
1119 case ir_loop_jump::jump_continue:
1120 emit(BRW_OPCODE_CONTINUE);
1121 break;
1122 }
1123 }
1124
1125
1126 void
1127 vec4_visitor::visit(ir_function_signature *)
1128 {
1129 unreachable("not reached");
1130 }
1131
1132 void
1133 vec4_visitor::visit(ir_function *ir)
1134 {
1135 /* Ignore function bodies other than main() -- we shouldn't see calls to
1136 * them since they should all be inlined.
1137 */
1138 if (strcmp(ir->name, "main") == 0) {
1139 const ir_function_signature *sig;
1140 exec_list empty;
1141
1142 sig = ir->matching_signature(NULL, &empty, false);
1143
1144 assert(sig);
1145
1146 visit_instructions(&sig->body);
1147 }
1148 }
1149
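/* Try to fuse an add of a multiply into a single MAD.  Both operand orders
 * are matched, and a negated multiply, x + -(y * z), is handled by folding
 * the negation into one of the multiplicand sources.
 */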
1150 bool
1151 vec4_visitor::try_emit_mad(ir_expression *ir)
1152 {
1153 /* 3-src instructions were introduced in gen6. */
1154 if (brw->gen < 6)
1155 return false;
1156
1157 /* MAD can only handle floating-point data. */
1158 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1159 return false;
1160
1161 ir_rvalue *nonmul = ir->operands[1];
1162 ir_expression *mul = ir->operands[0]->as_expression();
1163
1164 bool mul_negate = false;
1165 if (mul && mul->operation == ir_unop_neg) {
1166 mul = mul->operands[0]->as_expression();
1167 mul_negate = true;
1168 }
1169
1170 if (!mul || mul->operation != ir_binop_mul) {
1171 nonmul = ir->operands[0];
1172 mul = ir->operands[1]->as_expression();
1173
1174 if (mul && mul->operation == ir_unop_neg) {
1175 mul = mul->operands[0]->as_expression();
1176 mul_negate = true;
1177 }
1178
1179 if (!mul || mul->operation != ir_binop_mul)
1180 return false;
1181 }
1182
1183 nonmul->accept(this);
1184 src_reg src0 = fix_3src_operand(this->result);
1185
1186 mul->operands[0]->accept(this);
1187 src_reg src1 = fix_3src_operand(this->result);
1188 src1.negate ^= mul_negate;
1189
1190 mul->operands[1]->accept(this);
1191 src_reg src2 = fix_3src_operand(this->result);
1192
1193 this->result = src_reg(this, ir->type);
1194 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1195
1196 return true;
1197 }
1198
1199 bool
1200 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1201 {
1202 /* This optimization relies on CMP setting the destination to 0 when
1203 * false. Early hardware only sets the least significant bit, and
1204 * leaves the other bits undefined. So we can't use it.
1205 */
1206 if (brw->gen < 6)
1207 return false;
1208
1209 ir_expression *const cmp = ir->operands[0]->as_expression();
1210
1211 if (cmp == NULL)
1212 return false;
1213
1214 switch (cmp->operation) {
1215 case ir_binop_less:
1216 case ir_binop_greater:
1217 case ir_binop_lequal:
1218 case ir_binop_gequal:
1219 case ir_binop_equal:
1220 case ir_binop_nequal:
1221 break;
1222
1223 default:
1224 return false;
1225 }
1226
1227 cmp->operands[0]->accept(this);
1228 const src_reg cmp_src0 = this->result;
1229
1230 cmp->operands[1]->accept(this);
1231 const src_reg cmp_src1 = this->result;
1232
1233 this->result = src_reg(this, ir->type);
1234
1235 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1236 brw_conditional_for_comparison(cmp->operation)));
1237
1238 /* If the comparison is false, this->result will just happen to be zero.
1239 */
1240 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1241 this->result, src_reg(1.0f));
1242 inst->predicate = BRW_PREDICATE_NORMAL;
1243 inst->predicate_inverse = true;
1244
1245 return true;
1246 }
1247
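/* Emit a min or max (selected by the conditional mod).  Gen6+ can fold the
 * comparison into a single SEL; earlier generations need a CMP followed by a
 * predicated SEL.
 */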
1248 void
1249 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1250 src_reg src0, src_reg src1)
1251 {
1252 vec4_instruction *inst;
1253
1254 if (brw->gen >= 6) {
1255 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1256 inst->conditional_mod = conditionalmod;
1257 } else {
1258 emit(CMP(dst, src0, src1, conditionalmod));
1259
1260 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1261 inst->predicate = BRW_PREDICATE_NORMAL;
1262 }
1263 }
1264
1265 void
1266 vec4_visitor::emit_lrp(const dst_reg &dst,
1267 const src_reg &x, const src_reg &y, const src_reg &a)
1268 {
1269 if (brw->gen >= 6) {
1270 /* Note that the instruction's argument order is reversed from GLSL
1271 * and the IR.
1272 */
1273 emit(LRP(dst,
1274 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1275 } else {
1276 /* Earlier generations don't support three source operations, so we
1277 * need to emit x*(1-a) + y*a.
1278 */
1279 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1280 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1281 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1282 y_times_a.writemask = dst.writemask;
1283 one_minus_a.writemask = dst.writemask;
1284 x_times_one_minus_a.writemask = dst.writemask;
1285
1286 emit(MUL(y_times_a, y, a));
1287 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1288 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1289 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1290 }
1291 }
1292
1293 void
1294 vec4_visitor::visit(ir_expression *ir)
1295 {
1296 unsigned int operand;
1297 src_reg op[Elements(ir->operands)];
1298 vec4_instruction *inst;
1299
1300 if (ir->operation == ir_binop_add) {
1301 if (try_emit_mad(ir))
1302 return;
1303 }
1304
1305 if (ir->operation == ir_unop_b2f) {
1306 if (try_emit_b2f_of_compare(ir))
1307 return;
1308 }
1309
1310 /* Storage for our result. Ideally for an assignment we'd be using
1311 * the actual storage for the result here, instead.
1312 */
1313 dst_reg result_dst(this, ir->type);
1314 src_reg result_src(result_dst);
1315
1316 if (ir->operation == ir_triop_csel) {
1317 ir->operands[1]->accept(this);
1318 op[1] = this->result;
1319 ir->operands[2]->accept(this);
1320 op[2] = this->result;
1321
1322 enum brw_predicate predicate;
1323 emit_bool_to_cond_code(ir->operands[0], &predicate);
1324 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1325 inst->predicate = predicate;
1326 this->result = result_src;
1327 return;
1328 }
1329
1330 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1331 this->result.file = BAD_FILE;
1332 ir->operands[operand]->accept(this);
1333 if (this->result.file == BAD_FILE) {
1334 fprintf(stderr, "Failed to get tree for expression operand:\n");
1335 ir->operands[operand]->fprint(stderr);
1336 exit(1);
1337 }
1338 op[operand] = this->result;
1339
1340 /* Matrix expression operands should have been broken down to vector
1341 * operations already.
1342 */
1343 assert(!ir->operands[operand]->type->is_matrix());
1344 }
1345
1346 /* If nothing special happens, this is the result. */
1347 this->result = result_src;
1348
1349 switch (ir->operation) {
1350 case ir_unop_logic_not:
1351 emit(NOT(result_dst, op[0]));
1352 break;
1353 case ir_unop_neg:
1354 op[0].negate = !op[0].negate;
1355 emit(MOV(result_dst, op[0]));
1356 break;
1357 case ir_unop_abs:
1358 op[0].abs = true;
1359 op[0].negate = false;
1360 emit(MOV(result_dst, op[0]));
1361 break;
1362
1363 case ir_unop_sign:
1364 if (ir->type->is_float()) {
1365 /* AND(val, 0x80000000) gives the sign bit.
1366 *
1367 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1368 * zero.
1369 */
1370 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1371
1372 op[0].type = BRW_REGISTER_TYPE_UD;
1373 result_dst.type = BRW_REGISTER_TYPE_UD;
1374 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1375
1376 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1377 inst->predicate = BRW_PREDICATE_NORMAL;
1378
1379 this->result.type = BRW_REGISTER_TYPE_F;
1380 } else {
1381 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1382 * -> non-negative val generates 0x00000000.
1383 * Predicated OR sets 1 if val is positive.
1384 */
1385 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1386
1387 emit(ASR(result_dst, op[0], src_reg(31)));
1388
1389 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1390 inst->predicate = BRW_PREDICATE_NORMAL;
1391 }
1392 break;
1393
1394 case ir_unop_rcp:
1395 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1396 break;
1397
1398 case ir_unop_exp2:
1399 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1400 break;
1401 case ir_unop_log2:
1402 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1403 break;
1404 case ir_unop_exp:
1405 case ir_unop_log:
1406 unreachable("not reached: should be handled by ir_explog_to_explog2");
1407 case ir_unop_sin:
1408 case ir_unop_sin_reduced:
1409 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1410 break;
1411 case ir_unop_cos:
1412 case ir_unop_cos_reduced:
1413 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1414 break;
1415
1416 case ir_unop_dFdx:
1417 case ir_unop_dFdx_coarse:
1418 case ir_unop_dFdx_fine:
1419 case ir_unop_dFdy:
1420 case ir_unop_dFdy_coarse:
1421 case ir_unop_dFdy_fine:
1422 unreachable("derivatives not valid in vertex shader");
1423
1424 case ir_unop_bitfield_reverse:
1425 emit(BFREV(result_dst, op[0]));
1426 break;
1427 case ir_unop_bit_count:
1428 emit(CBIT(result_dst, op[0]));
1429 break;
1430 case ir_unop_find_msb: {
1431 src_reg temp = src_reg(this, glsl_type::uint_type);
1432
1433 inst = emit(FBH(dst_reg(temp), op[0]));
1434 inst->dst.writemask = WRITEMASK_XYZW;
1435
1436 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1437 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1438 * subtract the result from 31 to convert the MSB count into an LSB count.
1439 */
1440
1441 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1442 temp.swizzle = BRW_SWIZZLE_NOOP;
1443 emit(MOV(result_dst, temp));
1444
1445 src_reg src_tmp = src_reg(result_dst);
1446 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1447
1448 src_tmp.negate = true;
1449 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1450 inst->predicate = BRW_PREDICATE_NORMAL;
1451 break;
1452 }
1453 case ir_unop_find_lsb:
1454 emit(FBL(result_dst, op[0]));
1455 break;
1456 case ir_unop_saturate:
1457 inst = emit(MOV(result_dst, op[0]));
1458 inst->saturate = true;
1459 break;
1460
1461 case ir_unop_noise:
1462 unreachable("not reached: should be handled by lower_noise");
1463
1464 case ir_binop_add:
1465 emit(ADD(result_dst, op[0], op[1]));
1466 break;
1467 case ir_binop_sub:
1468 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1469
1470 case ir_binop_mul:
1471 if (brw->gen < 8 && ir->type->is_integer()) {
1472 /* For integer multiplication, the MUL uses the low 16 bits of one of
1473 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1474 * accumulates in the contribution of the upper 16 bits of that
1475 * operand. If we can determine that one of the args is in the low
1476 * 16 bits, though, we can just emit a single MUL.
1477 */
1478 if (ir->operands[0]->is_uint16_constant()) {
1479 if (brw->gen < 7)
1480 emit(MUL(result_dst, op[0], op[1]));
1481 else
1482 emit(MUL(result_dst, op[1], op[0]));
1483 } else if (ir->operands[1]->is_uint16_constant()) {
1484 if (brw->gen < 7)
1485 emit(MUL(result_dst, op[1], op[0]));
1486 else
1487 emit(MUL(result_dst, op[0], op[1]));
1488 } else {
1489 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1490
1491 emit(MUL(acc, op[0], op[1]));
1492 emit(MACH(dst_null_d(), op[0], op[1]));
1493 emit(MOV(result_dst, src_reg(acc)));
1494 }
1495 } else {
1496 emit(MUL(result_dst, op[0], op[1]));
1497 }
1498 break;
1499 case ir_binop_imul_high: {
1500 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1501
1502 emit(MUL(acc, op[0], op[1]));
1503 emit(MACH(result_dst, op[0], op[1]));
1504 break;
1505 }
1506 case ir_binop_div:
1507 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1508 assert(ir->type->is_integer());
1509 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1510 break;
1511 case ir_binop_carry: {
1512 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1513
1514 emit(ADDC(dst_null_ud(), op[0], op[1]));
1515 emit(MOV(result_dst, src_reg(acc)));
1516 break;
1517 }
1518 case ir_binop_borrow: {
1519 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1520
1521 emit(SUBB(dst_null_ud(), op[0], op[1]));
1522 emit(MOV(result_dst, src_reg(acc)));
1523 break;
1524 }
1525 case ir_binop_mod:
1526 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1527 assert(ir->type->is_integer());
1528 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1529 break;
1530
1531 case ir_binop_less:
1532 case ir_binop_greater:
1533 case ir_binop_lequal:
1534 case ir_binop_gequal:
1535 case ir_binop_equal:
1536 case ir_binop_nequal: {
1537 if (brw->gen <= 5) {
1538 resolve_bool_comparison(ir->operands[0], &op[0]);
1539 resolve_bool_comparison(ir->operands[1], &op[1]);
1540 }
1541 emit(CMP(result_dst, op[0], op[1],
1542 brw_conditional_for_comparison(ir->operation)));
1543 break;
1544 }
1545
1546 case ir_binop_all_equal:
1547 /* "==" operator producing a scalar boolean. */
1548 if (ir->operands[0]->type->is_vector() ||
1549 ir->operands[1]->type->is_vector()) {
1550 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1551 emit(MOV(result_dst, src_reg(0)));
1552 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1553 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1554 } else {
1555 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1556 }
1557 break;
1558 case ir_binop_any_nequal:
1559 /* "!=" operator producing a scalar boolean. */
1560 if (ir->operands[0]->type->is_vector() ||
1561 ir->operands[1]->type->is_vector()) {
1562 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1563
1564 emit(MOV(result_dst, src_reg(0)));
1565 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1566 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1567 } else {
1568 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1569 }
1570 break;
1571
1572 case ir_unop_any:
1573 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1574 emit(MOV(result_dst, src_reg(0)));
1575
1576 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1577 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1578 break;
1579
1580 case ir_binop_logic_xor:
1581 emit(XOR(result_dst, op[0], op[1]));
1582 break;
1583
1584 case ir_binop_logic_or:
1585 emit(OR(result_dst, op[0], op[1]));
1586 break;
1587
1588 case ir_binop_logic_and:
1589 emit(AND(result_dst, op[0], op[1]));
1590 break;
1591
1592 case ir_binop_dot:
1593 assert(ir->operands[0]->type->is_vector());
1594 assert(ir->operands[0]->type == ir->operands[1]->type);
1595 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1596 break;
1597
1598 case ir_unop_sqrt:
1599 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1600 break;
1601 case ir_unop_rsq:
1602 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1603 break;
1604
1605 case ir_unop_bitcast_i2f:
1606 case ir_unop_bitcast_u2f:
1607 this->result = op[0];
1608 this->result.type = BRW_REGISTER_TYPE_F;
1609 break;
1610
1611 case ir_unop_bitcast_f2i:
1612 this->result = op[0];
1613 this->result.type = BRW_REGISTER_TYPE_D;
1614 break;
1615
1616 case ir_unop_bitcast_f2u:
1617 this->result = op[0];
1618 this->result.type = BRW_REGISTER_TYPE_UD;
1619 break;
1620
1621 case ir_unop_i2f:
1622 case ir_unop_i2u:
1623 case ir_unop_u2i:
1624 case ir_unop_u2f:
1625 case ir_unop_f2i:
1626 case ir_unop_f2u:
1627 emit(MOV(result_dst, op[0]));
1628 break;
1629 case ir_unop_b2i:
1630 emit(AND(result_dst, op[0], src_reg(1)));
1631 break;
1632 case ir_unop_b2f:
1633 if (brw->gen <= 5) {
1634 resolve_bool_comparison(ir->operands[0], &op[0]);
1635 }
1636 op[0].type = BRW_REGISTER_TYPE_D;
1637 result_dst.type = BRW_REGISTER_TYPE_D;
1638 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1639 result_dst.type = BRW_REGISTER_TYPE_F;
1640 break;
1641 case ir_unop_f2b:
1642 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1643 break;
1644 case ir_unop_i2b:
1645 emit(AND(result_dst, op[0], src_reg(1)));
1646 break;
1647
1648 case ir_unop_trunc:
1649 emit(RNDZ(result_dst, op[0]));
1650 break;
1651 case ir_unop_ceil: {
1652 src_reg tmp = src_reg(this, ir->type);
1653 op[0].negate = !op[0].negate;
1654 emit(RNDD(dst_reg(tmp), op[0]));
1655 tmp.negate = true;
1656 emit(MOV(result_dst, tmp));
1657 }
1658 break;
1659 case ir_unop_floor:
1660 inst = emit(RNDD(result_dst, op[0]));
1661 break;
1662 case ir_unop_fract:
1663 inst = emit(FRC(result_dst, op[0]));
1664 break;
1665 case ir_unop_round_even:
1666 emit(RNDE(result_dst, op[0]));
1667 break;
1668
1669 case ir_binop_min:
1670 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1671 break;
1672 case ir_binop_max:
1673 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1674 break;
1675
1676 case ir_binop_pow:
1677 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1678 break;
1679
1680 case ir_unop_bit_not:
1681 inst = emit(NOT(result_dst, op[0]));
1682 break;
1683 case ir_binop_bit_and:
1684 inst = emit(AND(result_dst, op[0], op[1]));
1685 break;
1686 case ir_binop_bit_xor:
1687 inst = emit(XOR(result_dst, op[0], op[1]));
1688 break;
1689 case ir_binop_bit_or:
1690 inst = emit(OR(result_dst, op[0], op[1]));
1691 break;
1692
1693 case ir_binop_lshift:
1694 inst = emit(SHL(result_dst, op[0], op[1]));
1695 break;
1696
1697 case ir_binop_rshift:
1698 if (ir->type->base_type == GLSL_TYPE_INT)
1699 inst = emit(ASR(result_dst, op[0], op[1]));
1700 else
1701 inst = emit(SHR(result_dst, op[0], op[1]));
1702 break;
1703
1704 case ir_binop_bfm:
1705 emit(BFI1(result_dst, op[0], op[1]));
1706 break;
1707
1708 case ir_binop_ubo_load: {
1709 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1710 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1711 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1712 src_reg offset;
1713
1714 /* Now, load the vector from that offset. */
1715 assert(ir->type->is_vector() || ir->type->is_scalar());
1716
1717 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1718 packed_consts.type = result.type;
1719 src_reg surf_index;
1720
1721 if (const_uniform_block) {
1722 /* The block index is a constant, so just emit the binding table entry
1723 * as an immediate.
1724 */
1725 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1726 const_uniform_block->value.u[0]);
1727 } else {
1728 /* The block index is not a constant. Evaluate the index expression
1729 * per-channel and add the base UBO index; the generator will select
1730 * a value from any live channel.
1731 */
1732 surf_index = src_reg(this, glsl_type::uint_type);
1733 emit(ADD(dst_reg(surf_index), op[0],
1734 src_reg(prog_data->base.binding_table.ubo_start)));
1735
1736 /* Assume this may touch any UBO. It would be nice to provide
1737 * a tighter bound, but the array information is already lowered away.
1738 */
1739 brw_mark_surface_used(&prog_data->base,
1740 prog_data->base.binding_table.ubo_start +
1741 shader_prog->NumUniformBlocks - 1);
1742 }
1743
1744 if (const_offset_ir) {
1745 if (brw->gen >= 8) {
1746 /* Store the offset in a GRF so we can send-from-GRF. */
1747 offset = src_reg(this, glsl_type::int_type);
1748 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1749 } else {
1750 /* Immediates are fine on older generations since they'll be moved
1751 * to a (potentially fake) MRF at the generator level.
1752 */
1753 offset = src_reg(const_offset / 16);
1754 }
1755 } else {
1756 offset = src_reg(this, glsl_type::uint_type);
1757 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1758 }
1759
1760 if (brw->gen >= 7) {
1761 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1762 grf_offset.type = offset.type;
1763
1764 emit(MOV(grf_offset, offset));
1765
1766 vec4_instruction *pull =
1767 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1768 dst_reg(packed_consts),
1769 surf_index,
1770 src_reg(grf_offset)));
1771 pull->mlen = 1;
1772 } else {
1773 vec4_instruction *pull =
1774 emit(new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1775 dst_reg(packed_consts),
1776 surf_index,
1777 offset));
1778 pull->base_mrf = 14;
1779 pull->mlen = 1;
1780 }
1781
1782 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1783 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1784 const_offset % 16 / 4,
1785 const_offset % 16 / 4,
1786 const_offset % 16 / 4);
1787
1788 /* UBO bools are any nonzero int. We need to convert them to use the
1789 * value of true stored in ctx->Const.UniformBooleanTrue.
1790 */
1791 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1792 emit(CMP(result_dst, packed_consts, src_reg(0u),
1793 BRW_CONDITIONAL_NZ));
1794 } else {
1795 emit(MOV(result_dst, packed_consts));
1796 }
1797 break;
1798 }
1799
1800 case ir_binop_vector_extract:
1801 unreachable("should have been lowered by vec_index_to_cond_assign");
1802
1803 case ir_triop_fma:
1804 op[0] = fix_3src_operand(op[0]);
1805 op[1] = fix_3src_operand(op[1]);
1806 op[2] = fix_3src_operand(op[2]);
1807 /* Note that the instruction's argument order is reversed from GLSL
1808 * and the IR.
1809 */
1810 emit(MAD(result_dst, op[2], op[1], op[0]));
1811 break;
1812
1813 case ir_triop_lrp:
1814 emit_lrp(result_dst, op[0], op[1], op[2]);
1815 break;
1816
1817 case ir_triop_csel:
1818 unreachable("already handled above");
1819 break;
1820
1821 case ir_triop_bfi:
1822 op[0] = fix_3src_operand(op[0]);
1823 op[1] = fix_3src_operand(op[1]);
1824 op[2] = fix_3src_operand(op[2]);
1825 emit(BFI2(result_dst, op[0], op[1], op[2]));
1826 break;
1827
1828 case ir_triop_bitfield_extract:
1829 op[0] = fix_3src_operand(op[0]);
1830 op[1] = fix_3src_operand(op[1]);
1831 op[2] = fix_3src_operand(op[2]);
1832 /* Note that the instruction's argument order is reversed from GLSL
1833 * and the IR.
1834 */
1835 emit(BFE(result_dst, op[2], op[1], op[0]));
1836 break;
1837
1838 case ir_triop_vector_insert:
1839 unreachable("should have been lowered by lower_vector_insert");
1840
1841 case ir_quadop_bitfield_insert:
1842 unreachable("not reached: should be handled by "
1843 "bitfield_insert_to_bfm_bfi\n");
1844
1845 case ir_quadop_vector:
1846 unreachable("not reached: should be handled by lower_quadop_vector");
1847
1848 case ir_unop_pack_half_2x16:
1849 emit_pack_half_2x16(result_dst, op[0]);
1850 break;
1851 case ir_unop_unpack_half_2x16:
1852 emit_unpack_half_2x16(result_dst, op[0]);
1853 break;
1854 case ir_unop_unpack_unorm_4x8:
1855 emit_unpack_unorm_4x8(result_dst, op[0]);
1856 break;
1857 case ir_unop_unpack_snorm_4x8:
1858 emit_unpack_snorm_4x8(result_dst, op[0]);
1859 break;
1860 case ir_unop_pack_unorm_4x8:
1861 emit_pack_unorm_4x8(result_dst, op[0]);
1862 break;
1863 case ir_unop_pack_snorm_4x8:
1864 emit_pack_snorm_4x8(result_dst, op[0]);
1865 break;
1866 case ir_unop_pack_snorm_2x16:
1867 case ir_unop_pack_unorm_2x16:
1868 case ir_unop_unpack_snorm_2x16:
1869 case ir_unop_unpack_unorm_2x16:
1870 unreachable("not reached: should be handled by lower_packing_builtins");
1871 case ir_unop_unpack_half_2x16_split_x:
1872 case ir_unop_unpack_half_2x16_split_y:
1873 case ir_binop_pack_half_2x16_split:
1874 case ir_unop_interpolate_at_centroid:
1875 case ir_binop_interpolate_at_sample:
1876 case ir_binop_interpolate_at_offset:
1877 unreachable("not reached: should not occur in vertex shader");
1878 case ir_binop_ldexp:
1879 unreachable("not reached: should be handled by ldexp_to_arith()");
1880 }
1881 }
1882
1883
1884 void
1885 vec4_visitor::visit(ir_swizzle *ir)
1886 {
1887 src_reg src;
1888 int i = 0;
1889 int swizzle[4];
1890
1891 /* Note that this is only swizzles in expressions, not those on the left
1892 * hand side of an assignment, which do write masking. See ir_assignment
1893 * for that.
1894 */
1895
1896 ir->val->accept(this);
1897 src = this->result;
1898 assert(src.file != BAD_FILE);
1899
1900 for (i = 0; i < ir->type->vector_elements; i++) {
1901 switch (i) {
1902 case 0:
1903 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1904 break;
1905 case 1:
1906 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1907 break;
1908 case 2:
1909 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1910 break;
1911 case 3:
1912 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1913 break;
1914 }
1915 }
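/* For types narrower than vec4, e.g. a vec2 .yx swizzle, the loop below pads the result out to .yxxx. */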
1916 for (; i < 4; i++) {
1917 /* Replicate the last channel out. */
1918 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1919 }
1920
1921 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1922
1923 this->result = src;
1924 }
1925
1926 void
1927 vec4_visitor::visit(ir_dereference_variable *ir)
1928 {
1929 const struct glsl_type *type = ir->type;
1930 dst_reg *reg = variable_storage(ir->var);
1931
1932 if (!reg) {
1933 fail("Failed to find variable storage for %s\n", ir->var->name);
1934 this->result = src_reg(brw_null_reg());
1935 return;
1936 }
1937
1938 this->result = src_reg(*reg);
1939
1940 /* System values get their swizzle from the dst_reg writemask */
1941 if (ir->var->data.mode == ir_var_system_value)
1942 return;
1943
1944 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1945 this->result.swizzle = swizzle_for_size(type->vector_elements);
1946 }
1947
1948
1949 int
1950 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1951 {
1952 /* Under normal circumstances array elements are stored consecutively, so
1953 * the stride is equal to the size of the array element.
1954 */
1955 return type_size(ir->type);
1956 }
1957
1958
1959 void
1960 vec4_visitor::visit(ir_dereference_array *ir)
1961 {
1962 ir_constant *constant_index;
1963 src_reg src;
1964 int array_stride = compute_array_stride(ir);
1965
1966 constant_index = ir->array_index->constant_expression_value();
1967
1968 ir->array->accept(this);
1969 src = this->result;
1970
1971 if (constant_index) {
1972 src.reg_offset += constant_index->value.i[0] * array_stride;
1973 } else {
1974 /* Variable index array dereference. It eats the "vec4" of the
1975 * base of the array and an index that offsets the Mesa register
1976 * index.
1977 */
1978 ir->array_index->accept(this);
1979
1980 src_reg index_reg;
1981
1982 if (array_stride == 1) {
1983 index_reg = this->result;
1984 } else {
1985 index_reg = src_reg(this, glsl_type::int_type);
1986
1987 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1988 }
1989
1990 if (src.reladdr) {
1991 src_reg temp = src_reg(this, glsl_type::int_type);
1992
1993 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1994
1995 index_reg = temp;
1996 }
1997
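/* Attach the computed index as the register's relative address. */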
1998 src.reladdr = ralloc(mem_ctx, src_reg);
1999 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2000 }
2001
2002 /* If the type is smaller than a vec4, replicate the last channel out. */
2003 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2004 src.swizzle = swizzle_for_size(ir->type->vector_elements);
2005 else
2006 src.swizzle = BRW_SWIZZLE_NOOP;
2007 src.type = brw_type_for_base_type(ir->type);
2008
2009 this->result = src;
2010 }
2011
2012 void
2013 vec4_visitor::visit(ir_dereference_record *ir)
2014 {
2015 unsigned int i;
2016 const glsl_type *struct_type = ir->record->type;
2017 int offset = 0;
2018
2019 ir->record->accept(this);
2020
2021 for (i = 0; i < struct_type->length; i++) {
2022 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2023 break;
2024 offset += type_size(struct_type->fields.structure[i].type);
2025 }
2026
2027 /* If the type is smaller than a vec4, replicate the last channel out. */
2028 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2029 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2030 else
2031 this->result.swizzle = BRW_SWIZZLE_NOOP;
2032 this->result.type = brw_type_for_base_type(ir->type);
2033
2034 this->result.reg_offset += offset;
2035 }
2036
2037 /**
2038 * We want to be careful in assignment setup to hit the actual storage
2039 * instead of potentially using a temporary like we might with the
2040 * ir_dereference handler.
2041 */
2042 static dst_reg
2043 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2044 {
2045 /* The LHS must be a dereference. If the LHS is a variable indexed array
2046 * access of a vector, it must be separated into a series of conditional moves
2047 * before reaching this point (see ir_vec_index_to_cond_assign).
2048 */
2049 assert(ir->as_dereference());
2050 ir_dereference_array *deref_array = ir->as_dereference_array();
2051 if (deref_array) {
2052 assert(!deref_array->array->type->is_vector());
2053 }
2054
2055 /* Use the rvalue deref handler for the most part. We'll ignore
2056 * swizzles in it and write swizzles using writemask, though.
2057 */
2058 ir->accept(v);
2059 return dst_reg(v->result);
2060 }
2061
2062 void
2063 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2064 const struct glsl_type *type,
2065 enum brw_predicate predicate)
2066 {
2067 if (type->base_type == GLSL_TYPE_STRUCT) {
2068 for (unsigned int i = 0; i < type->length; i++) {
2069 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2070 }
2071 return;
2072 }
2073
2074 if (type->is_array()) {
2075 for (unsigned int i = 0; i < type->length; i++) {
2076 emit_block_move(dst, src, type->fields.array, predicate);
2077 }
2078 return;
2079 }
2080
2081 if (type->is_matrix()) {
2082 const struct glsl_type *vec_type;
2083
2084 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2085 type->vector_elements, 1);
2086
2087 for (int i = 0; i < type->matrix_columns; i++) {
2088 emit_block_move(dst, src, vec_type, predicate);
2089 }
2090 return;
2091 }
2092
2093 assert(type->is_scalar() || type->is_vector());
2094
2095 dst->type = brw_type_for_base_type(type);
2096 src->type = dst->type;
2097
2098 dst->writemask = (1 << type->vector_elements) - 1;
2099
2100 src->swizzle = swizzle_for_size(type->vector_elements);
2101
2102 vec4_instruction *inst = emit(MOV(*dst, *src));
2103 inst->predicate = predicate;
2104
2105 dst->reg_offset++;
2106 src->reg_offset++;
2107 }
2108
2109
2110 /* If the RHS processing resulted in an instruction generating a
2111 * temporary value, and it would be easy to rewrite the instruction to
2112 * generate its result right into the LHS instead, do so. This ends
2113 * up reliably removing instructions where it can be tricky to do so
2114 * later without real UD chain information.
2115 */
2116 bool
2117 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2118 dst_reg dst,
2119 src_reg src,
2120 vec4_instruction *pre_rhs_inst,
2121 vec4_instruction *last_rhs_inst)
2122 {
2123 /* This could be supported, but it would take more smarts. */
2124 if (ir->condition)
2125 return false;
2126
2127 if (pre_rhs_inst == last_rhs_inst)
2128 return false; /* No instructions generated to work with. */
2129
2130 /* Make sure the last instruction generated our source reg. */
2131 if (src.file != GRF ||
2132 src.file != last_rhs_inst->dst.file ||
2133 src.reg != last_rhs_inst->dst.reg ||
2134 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2135 src.reladdr ||
2136 src.abs ||
2137 src.negate ||
2138 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2139 return false;
2140
2141 /* Check that the last instruction fully initialized the channels
2142 * we want to use, in the order we want to use them. We could
2143 * potentially reswizzle the operands of many instructions so that
2144 * we could handle out of order channels, but don't yet.
2145 */
2146
2147 for (unsigned i = 0; i < 4; i++) {
2148 if (dst.writemask & (1 << i)) {
2149 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2150 return false;
2151
2152 if (BRW_GET_SWZ(src.swizzle, i) != i)
2153 return false;
2154 }
2155 }
2156
2157 /* Success! Rewrite the instruction. */
2158 last_rhs_inst->dst.file = dst.file;
2159 last_rhs_inst->dst.reg = dst.reg;
2160 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2161 last_rhs_inst->dst.reladdr = dst.reladdr;
2162 last_rhs_inst->dst.writemask &= dst.writemask;
2163
2164 return true;
2165 }
2166
2167 void
2168 vec4_visitor::visit(ir_assignment *ir)
2169 {
2170 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2171 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2172
2173 if (!ir->lhs->type->is_scalar() &&
2174 !ir->lhs->type->is_vector()) {
2175 ir->rhs->accept(this);
2176 src_reg src = this->result;
2177
2178 if (ir->condition) {
2179 emit_bool_to_cond_code(ir->condition, &predicate);
2180 }
2181
2182 /* emit_block_move doesn't account for swizzles in the source register.
2183 * This should be ok, since the source register is a structure or an
2184 * array, and those can't be swizzled. But double-check to be sure.
2185 */
2186 assert(src.swizzle ==
2187 (ir->rhs->type->is_matrix()
2188 ? swizzle_for_size(ir->rhs->type->vector_elements)
2189 : BRW_SWIZZLE_NOOP));
2190
2191 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2192 return;
2193 }
2194
2195 /* Now we're down to just a scalar/vector with writemasks. */
2196 int i;
2197
2198 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2199 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2200
2201 ir->rhs->accept(this);
2202
2203 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2204
2205 src_reg src = this->result;
2206
2207 int swizzles[4];
2208 int first_enabled_chan = 0;
2209 int src_chan = 0;
2210
2211 assert(ir->lhs->type->is_vector() ||
2212 ir->lhs->type->is_scalar());
2213 dst.writemask = ir->write_mask;
2214
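/* Note which source channel the swizzle selects for the first written component; unwritten components reuse it below so the swizzle never points at undefined data. */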
2215 for (int i = 0; i < 4; i++) {
2216 if (dst.writemask & (1 << i)) {
2217 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2218 break;
2219 }
2220 }
2221
2222 /* Swizzle a small RHS vector into the channels being written.
2223 *
2224 * glsl ir treats write_mask as dictating how many channels are
2225 * present on the RHS while in our instructions we need to make
2226 * those channels appear in the slots of the vec4 they're written to.
2227 */
2228 for (int i = 0; i < 4; i++) {
2229 if (dst.writemask & (1 << i))
2230 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2231 else
2232 swizzles[i] = first_enabled_chan;
2233 }
2234 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2235 swizzles[2], swizzles[3]);
2236
2237 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2238 return;
2239 }
2240
2241 if (ir->condition) {
2242 emit_bool_to_cond_code(ir->condition, &predicate);
2243 }
2244
2245 for (i = 0; i < type_size(ir->lhs->type); i++) {
2246 vec4_instruction *inst = emit(MOV(dst, src));
2247 inst->predicate = predicate;
2248
2249 dst.reg_offset++;
2250 src.reg_offset++;
2251 }
2252 }
2253
2254 void
2255 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2256 {
2257 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2258 foreach_in_list(ir_constant, field_value, &ir->components) {
2259 emit_constant_values(dst, field_value);
2260 }
2261 return;
2262 }
2263
2264 if (ir->type->is_array()) {
2265 for (unsigned int i = 0; i < ir->type->length; i++) {
2266 emit_constant_values(dst, ir->array_elements[i]);
2267 }
2268 return;
2269 }
2270
2271 if (ir->type->is_matrix()) {
2272 for (int i = 0; i < ir->type->matrix_columns; i++) {
2273 float *vec = &ir->value.f[i * ir->type->vector_elements];
2274
2275 for (int j = 0; j < ir->type->vector_elements; j++) {
2276 dst->writemask = 1 << j;
2277 dst->type = BRW_REGISTER_TYPE_F;
2278
2279 emit(MOV(*dst, src_reg(vec[j])));
2280 }
2281 dst->reg_offset++;
2282 }
2283 return;
2284 }
2285
2286 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2287
2288 for (int i = 0; i < ir->type->vector_elements; i++) {
2289 if (!(remaining_writemask & (1 << i)))
2290 continue;
2291
2292 dst->writemask = 1 << i;
2293 dst->type = brw_type_for_base_type(ir->type);
2294
2295 /* Find other components that match the one we're about to
2296 * write. Emits fewer instructions for things like vec4(0.5,
2297 * 1.5, 1.5, 1.5).
2298 */
2299 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2300 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2301 if (ir->value.b[i] == ir->value.b[j])
2302 dst->writemask |= (1 << j);
2303 } else {
2304 /* u, i, and f storage all line up, so no need for a
2305 * switch case for comparing each type.
2306 */
2307 if (ir->value.u[i] == ir->value.u[j])
2308 dst->writemask |= (1 << j);
2309 }
2310 }
2311
2312 switch (ir->type->base_type) {
2313 case GLSL_TYPE_FLOAT:
2314 emit(MOV(*dst, src_reg(ir->value.f[i])));
2315 break;
2316 case GLSL_TYPE_INT:
2317 emit(MOV(*dst, src_reg(ir->value.i[i])));
2318 break;
2319 case GLSL_TYPE_UINT:
2320 emit(MOV(*dst, src_reg(ir->value.u[i])));
2321 break;
2322 case GLSL_TYPE_BOOL:
2323 emit(MOV(*dst,
2324 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2325 : 0)));
2326 break;
2327 default:
2328 unreachable("Non-float/uint/int/bool constant");
2329 }
2330
2331 remaining_writemask &= ~dst->writemask;
2332 }
2333 dst->reg_offset++;
2334 }
2335
2336 void
2337 vec4_visitor::visit(ir_constant *ir)
2338 {
2339 dst_reg dst = dst_reg(this, ir->type);
2340 this->result = src_reg(dst);
2341
2342 emit_constant_values(&dst, ir);
2343 }
2344
2345 void
2346 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2347 {
2348 ir_dereference *deref = static_cast<ir_dereference *>(
2349 ir->actual_parameters.get_head());
2350 ir_variable *location = deref->variable_referenced();
2351 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2352 location->data.binding);
2353
2354 /* Calculate the surface offset */
2355 src_reg offset(this, glsl_type::uint_type);
2356 ir_dereference_array *deref_array = deref->as_dereference_array();
2357 if (deref_array) {
2358 deref_array->array_index->accept(this);
2359
2360 src_reg tmp(this, glsl_type::uint_type);
2361 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2362 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2363 } else {
2364 offset = location->data.atomic.offset;
2365 }
2366
2367 /* Emit the appropriate machine instruction */
2368 const char *callee = ir->callee->function_name();
2369 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2370
2371 if (!strcmp("__intrinsic_atomic_read", callee)) {
2372 emit_untyped_surface_read(surf_index, dst, offset);
2373
2374 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2375 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2376 src_reg(), src_reg());
2377
2378 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2379 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2380 src_reg(), src_reg());
2381 }
2382 }
2383
2384 void
2385 vec4_visitor::visit(ir_call *ir)
2386 {
2387 const char *callee = ir->callee->function_name();
2388
2389 if (!strcmp("__intrinsic_atomic_read", callee) ||
2390 !strcmp("__intrinsic_atomic_increment", callee) ||
2391 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2392 visit_atomic_counter_intrinsic(ir);
2393 } else {
2394 unreachable("Unsupported intrinsic.");
2395 }
2396 }
2397
2398 src_reg
2399 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2400 {
2401 vec4_instruction *inst =
2402 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2403 dst_reg(this, glsl_type::uvec4_type));
2404 inst->base_mrf = 2;
2405 inst->mlen = 1;
2406 inst->src[1] = sampler;
2407
2408 /* parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
2409 int param_base = inst->base_mrf;
2410 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2411 int zero_mask = 0xf & ~coord_mask;
2412
2413 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2414 coordinate));
2415
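/* Zero any parameter channels the coordinate doesn't cover; the LOD in particular must be 0. */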
2416 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2417 src_reg(0)));
2418
2419 emit(inst);
2420 return src_reg(inst->dst);
2421 }
2422
2423 static bool
2424 is_high_sampler(struct brw_context *brw, src_reg sampler)
2425 {
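/* A sampler index of 16 or above doesn't fit in the 4-bit sampler field of the message descriptor and needs the message header; a non-immediate index has to be assumed high. Only Haswell and Gen8+ support this at all. */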
2426 if (brw->gen < 8 && !brw->is_haswell)
2427 return false;
2428
2429 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2430 }
2431
2432 void
2433 vec4_visitor::visit(ir_texture *ir)
2434 {
2435 uint32_t sampler =
2436 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2437
2438 ir_rvalue *nonconst_sampler_index =
2439 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2440
2441 /* Handle non-constant sampler array indexing */
2442 src_reg sampler_reg;
2443 if (nonconst_sampler_index) {
2444 /* The highest sampler which may be used by this operation is
2445 * the last element of the array. Mark it here, because the generator
2446 * doesn't have enough information to determine the bound.
2447 */
2448 uint32_t array_size = ir->sampler->as_dereference_array()
2449 ->array->type->array_size();
2450
2451 uint32_t max_used = sampler + array_size - 1;
2452 if (ir->op == ir_tg4 && brw->gen < 8) {
2453 max_used += prog_data->base.binding_table.gather_texture_start;
2454 } else {
2455 max_used += prog_data->base.binding_table.texture_start;
2456 }
2457
2458 brw_mark_surface_used(&prog_data->base, max_used);
2459
2460 /* Emit code to evaluate the actual indexing expression */
2461 nonconst_sampler_index->accept(this);
2462 dst_reg temp(this, glsl_type::uint_type);
2463 emit(ADD(temp, this->result, src_reg(sampler)))
2464 ->force_writemask_all = true;
2465 sampler_reg = src_reg(temp);
2466 } else {
2467 /* Single sampler, or constant array index; the indexing expression
2468 * is just an immediate.
2469 */
2470 sampler_reg = src_reg(sampler);
2471 }
2472
2473 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2474 * emitting anything other than setting up the constant result.
2475 */
2476 if (ir->op == ir_tg4) {
2477 ir_constant *chan = ir->lod_info.component->as_constant();
2478 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2479 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2480 dst_reg result(this, ir->type);
2481 this->result = src_reg(result);
2482 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2483 return;
2484 }
2485 }
2486
2487 /* Should be lowered by do_lower_texture_projection */
2488 assert(!ir->projector);
2489
2490 /* Should be lowered */
2491 assert(!ir->offset || !ir->offset->type->is_array());
2492
2493 /* Generate code to compute all the subexpression trees. This has to be
2494 * done before loading any values into MRFs for the sampler message since
2495 * generating these values may involve SEND messages that need the MRFs.
2496 */
2497 src_reg coordinate;
2498 if (ir->coordinate) {
2499 ir->coordinate->accept(this);
2500 coordinate = this->result;
2501 }
2502
2503 src_reg shadow_comparitor;
2504 if (ir->shadow_comparitor) {
2505 ir->shadow_comparitor->accept(this);
2506 shadow_comparitor = this->result;
2507 }
2508
2509 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2510 src_reg offset_value;
2511 if (has_nonconstant_offset) {
2512 ir->offset->accept(this);
2513 offset_value = src_reg(this->result);
2514 }
2515
2516 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2517 src_reg lod, dPdx, dPdy, sample_index, mcs;
2518 switch (ir->op) {
2519 case ir_tex:
2520 lod = src_reg(0.0f);
2521 lod_type = glsl_type::float_type;
2522 break;
2523 case ir_txf:
2524 case ir_txl:
2525 case ir_txs:
2526 ir->lod_info.lod->accept(this);
2527 lod = this->result;
2528 lod_type = ir->lod_info.lod->type;
2529 break;
2530 case ir_query_levels:
2531 lod = src_reg(0);
2532 lod_type = glsl_type::int_type;
2533 break;
2534 case ir_txf_ms:
2535 ir->lod_info.sample_index->accept(this);
2536 sample_index = this->result;
2537 sample_index_type = ir->lod_info.sample_index->type;
2538
2539 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2540 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2541 else
2542 mcs = src_reg(0u);
2543 break;
2544 case ir_txd:
2545 ir->lod_info.grad.dPdx->accept(this);
2546 dPdx = this->result;
2547
2548 ir->lod_info.grad.dPdy->accept(this);
2549 dPdy = this->result;
2550
2551 lod_type = ir->lod_info.grad.dPdx->type;
2552 break;
2553 case ir_txb:
2554 case ir_lod:
2555 case ir_tg4:
2556 break;
2557 }
2558
2559 enum opcode opcode;
2560 switch (ir->op) {
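/* The VS has no implicit derivatives, so plain texturing (ir_tex) is implemented as TXL with the 0.0 LOD set up above. */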
2561 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2562 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2563 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2564 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2565 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2566 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2567 case ir_tg4: opcode = has_nonconstant_offset
2568 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2569 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2570 case ir_txb:
2571 unreachable("TXB is not valid for vertex shaders.");
2572 case ir_lod:
2573 unreachable("LOD is not valid for vertex shaders.");
2574 default:
2575 unreachable("Unrecognized tex op");
2576 }
2577
2578 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2579 opcode, dst_reg(this, ir->type));
2580
2581 if (ir->offset != NULL && !has_nonconstant_offset) {
2582 inst->offset =
2583 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2584 ir->offset->type->vector_elements);
2585 }
2586
2587 /* Stuff the channel select bits in the top of the texture offset */
2588 if (ir->op == ir_tg4)
2589 inst->offset |= gather_channel(ir, sampler) << 16;
2590
2591 /* The message header is necessary for:
2592 * - Gen4 (always)
2593 * - Gen9+ for selecting SIMD4x2
2594 * - Texel offsets
2595 * - Gather channel selection
2596 * - Sampler indices too large to fit in a 4-bit value.
2597 */
2598 inst->header_present =
2599 brw->gen < 5 || brw->gen >= 9 ||
2600 inst->offset != 0 || ir->op == ir_tg4 ||
2601 is_high_sampler(brw, sampler_reg);
2602 inst->base_mrf = 2;
2603 inst->mlen = inst->header_present + 1; /* always at least one */
2604 inst->shadow_compare = ir->shadow_comparitor != NULL;
2605
2606 inst->src[1] = sampler_reg;
2607
2608 /* MRF for the first parameter */
2609 int param_base = inst->base_mrf + inst->header_present;
2610
2611 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2612 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2613 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2614 } else {
2615 /* Load the coordinate */
2616 /* FINISHME: gl_clamp_mask and saturate */
2617 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2618 int zero_mask = 0xf & ~coord_mask;
2619
2620 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2621 coordinate));
2622
2623 if (zero_mask != 0) {
2624 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2625 src_reg(0)));
2626 }
2627 /* Load the shadow comparitor */
2628 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2629 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2630 WRITEMASK_X),
2631 shadow_comparitor));
2632 inst->mlen++;
2633 }
2634
2635 /* Load the LOD info */
2636 if (ir->op == ir_tex || ir->op == ir_txl) {
2637 int mrf, writemask;
2638 if (brw->gen >= 5) {
2639 mrf = param_base + 1;
2640 if (ir->shadow_comparitor) {
2641 writemask = WRITEMASK_Y;
2642 /* mlen already incremented */
2643 } else {
2644 writemask = WRITEMASK_X;
2645 inst->mlen++;
2646 }
2647 } else /* brw->gen == 4 */ {
2648 mrf = param_base;
2649 writemask = WRITEMASK_W;
2650 }
2651 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2652 } else if (ir->op == ir_txf) {
2653 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2654 } else if (ir->op == ir_txf_ms) {
2655 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2656 sample_index));
2657 if (brw->gen >= 7) {
2658 /* MCS data is in the first channel of `mcs`, but we need to get it into
2659 * the .y channel of the second vec4 of params, so replicate .x across
2660 * the whole vec4 and then mask off everything except .y
2661 */
2662 mcs.swizzle = BRW_SWIZZLE_XXXX;
2663 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2664 mcs));
2665 }
2666 inst->mlen++;
2667 } else if (ir->op == ir_txd) {
2668 const glsl_type *type = lod_type;
2669
2670 if (brw->gen >= 5) {
2671 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2672 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2673 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2674 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2675 inst->mlen++;
2676
2677 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2678 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2679 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2680 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2681 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2682 inst->mlen++;
2683
2684 if (ir->shadow_comparitor) {
2685 emit(MOV(dst_reg(MRF, param_base + 2,
2686 ir->shadow_comparitor->type, WRITEMASK_Z),
2687 shadow_comparitor));
2688 }
2689 }
2690 } else /* brw->gen == 4 */ {
2691 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2692 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2693 inst->mlen += 2;
2694 }
2695 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2696 if (ir->shadow_comparitor) {
2697 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2698 shadow_comparitor));
2699 }
2700
2701 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2702 offset_value));
2703 inst->mlen++;
2704 }
2705 }
2706
2707 emit(inst);
2708
2709 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2710 * spec requires layers.
2711 */
2712 if (ir->op == ir_txs) {
2713 glsl_type const *type = ir->sampler->type;
2714 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2715 type->sampler_array) {
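/* Divide the Z (depth) result by the 6 cube faces to get the layer count. */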
2716 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2717 writemask(inst->dst, WRITEMASK_Z),
2718 src_reg(inst->dst), src_reg(6));
2719 }
2720 }
2721
2722 if (brw->gen == 6 && ir->op == ir_tg4) {
2723 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2724 }
2725
2726 swizzle_result(ir, src_reg(inst->dst), sampler);
2727 }
2728
2729 /**
2730 * Apply workarounds for Gen6 gather with UINT/SINT
2731 */
2732 void
2733 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2734 {
2735 if (!wa)
2736 return;
2737
2738 int width = (wa & WA_8BIT) ? 8 : 16;
2739 dst_reg dst_f = dst;
2740 dst_f.type = BRW_REGISTER_TYPE_F;
2741
2742 /* Convert from UNORM to UINT */
2743 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2744 emit(MOV(dst, src_reg(dst_f)));
2745
2746 if (wa & WA_SIGN) {
2747 /* Reinterpret the UINT value as a signed INT value by
2748 * shifting the sign bit into place, then shifting back
2749 * preserving sign.
2750 */
2751 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2752 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2753 }
2754 }
2755
2756 /**
2757 * Set up the gather channel based on the swizzle, for gather4.
2758 */
2759 uint32_t
2760 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2761 {
2762 ir_constant *chan = ir->lod_info.component->as_constant();
2763 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2764 switch (swiz) {
2765 case SWIZZLE_X: return 0;
2766 case SWIZZLE_Y:
2767 /* gather4 sampler is broken for green channel on RG32F --
2768 * we must ask for blue instead.
2769 */
2770 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2771 return 2;
2772 return 1;
2773 case SWIZZLE_Z: return 2;
2774 case SWIZZLE_W: return 3;
2775 default:
2776 unreachable("Not reached"); /* zero, one swizzles handled already */
2777 }
2778 }
2779
2780 void
2781 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2782 {
2783 int s = key->tex.swizzles[sampler];
2784
2785 this->result = src_reg(this, ir->type);
2786 dst_reg swizzled_result(this->result);
2787
2788 if (ir->op == ir_query_levels) {
2789 /* # levels is in .w */
2790 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2791 emit(MOV(swizzled_result, orig_val));
2792 return;
2793 }
2794
2795 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2796 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2797 emit(MOV(swizzled_result, orig_val));
2798 return;
2799 }
2800
2801
2802 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2803 int swizzle[4] = {0};
2804
2805 for (int i = 0; i < 4; i++) {
2806 switch (GET_SWZ(s, i)) {
2807 case SWIZZLE_ZERO:
2808 zero_mask |= (1 << i);
2809 break;
2810 case SWIZZLE_ONE:
2811 one_mask |= (1 << i);
2812 break;
2813 default:
2814 copy_mask |= (1 << i);
2815 swizzle[i] = GET_SWZ(s, i);
2816 break;
2817 }
2818 }
2819
2820 if (copy_mask) {
2821 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2822 swizzled_result.writemask = copy_mask;
2823 emit(MOV(swizzled_result, orig_val));
2824 }
2825
2826 if (zero_mask) {
2827 swizzled_result.writemask = zero_mask;
2828 emit(MOV(swizzled_result, src_reg(0.0f)));
2829 }
2830
2831 if (one_mask) {
2832 swizzled_result.writemask = one_mask;
2833 emit(MOV(swizzled_result, src_reg(1.0f)));
2834 }
2835 }
2836
2837 void
2838 vec4_visitor::visit(ir_return *)
2839 {
2840 unreachable("not reached");
2841 }
2842
2843 void
2844 vec4_visitor::visit(ir_discard *)
2845 {
2846 unreachable("not reached");
2847 }
2848
2849 void
2850 vec4_visitor::visit(ir_if *ir)
2851 {
2852 /* Don't point the annotation at the if statement, because then it plus
2853 * the then and else blocks get printed.
2854 */
2855 this->base_ir = ir->condition;
2856
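/* Gen6 has an IF instruction that embeds the comparison, so no separate predicate needs to be computed. */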
2857 if (brw->gen == 6) {
2858 emit_if_gen6(ir);
2859 } else {
2860 enum brw_predicate predicate;
2861 emit_bool_to_cond_code(ir->condition, &predicate);
2862 emit(IF(predicate));
2863 }
2864
2865 visit_instructions(&ir->then_instructions);
2866
2867 if (!ir->else_instructions.is_empty()) {
2868 this->base_ir = ir->condition;
2869 emit(BRW_OPCODE_ELSE);
2870
2871 visit_instructions(&ir->else_instructions);
2872 }
2873
2874 this->base_ir = ir->condition;
2875 emit(BRW_OPCODE_ENDIF);
2876 }
2877
2878 void
2879 vec4_visitor::visit(ir_emit_vertex *)
2880 {
2881 unreachable("not reached");
2882 }
2883
2884 void
2885 vec4_visitor::visit(ir_end_primitive *)
2886 {
2887 unreachable("not reached");
2888 }
2889
2890 void
2891 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2892 dst_reg dst, src_reg offset,
2893 src_reg src0, src_reg src1)
2894 {
2895 unsigned mlen = 0;
2896
2897 /* Set the atomic operation offset. */
2898 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2899 mlen++;
2900
2901 /* Set the atomic operation arguments. */
2902 if (src0.file != BAD_FILE) {
2903 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2904 mlen++;
2905 }
2906
2907 if (src1.file != BAD_FILE) {
2908 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2909 mlen++;
2910 }
2911
2912 /* Emit the instruction. Note that this maps to the normal SIMD8
2913 * untyped atomic message on Ivy Bridge, but that's OK because
2914 * unused channels will be masked out.
2915 */
2916 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2917 src_reg(atomic_op), src_reg(surf_index));
2918 inst->base_mrf = 0;
2919 inst->mlen = mlen;
2920 }
2921
2922 void
2923 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2924 src_reg offset)
2925 {
2926 /* Set the surface read offset. */
2927 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2928
2929 /* Emit the instruction. Note that this maps to the normal SIMD8
2930 * untyped surface read message, but that's OK because unused
2931 * channels will be masked out.
2932 */
2933 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2934 dst, src_reg(surf_index));
2935 inst->base_mrf = 0;
2936 inst->mlen = 1;
2937 }
2938
2939 void
2940 vec4_visitor::emit_ndc_computation()
2941 {
2942 /* Get the position */
2943 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2944
2945 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2946 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2947 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2948
2949 current_annotation = "NDC";
2950 dst_reg ndc_w = ndc;
2951 ndc_w.writemask = WRITEMASK_W;
2952 src_reg pos_w = pos;
2953 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2954 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2955
2956 dst_reg ndc_xyz = ndc;
2957 ndc_xyz.writemask = WRITEMASK_XYZ;
2958
2959 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2960 }
2961
2962 void
2963 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2964 {
2965 if (brw->gen < 6 &&
2966 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2967 key->userclip_active || brw->has_negative_rhw_bug)) {
2968 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2969 dst_reg header1_w = header1;
2970 header1_w.writemask = WRITEMASK_W;
2971
2972 emit(MOV(header1, 0u));
2973
2974 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2975 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2976
2977 current_annotation = "Point size";
2978 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2979 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2980 }
2981
2982 if (key->userclip_active) {
2983 current_annotation = "Clipping flags";
2984 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2985 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2986
2987 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2988 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2989 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2990
2991 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2992 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2993 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2994 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2995 }
2996
2997 /* i965 clipping workaround:
2998 * 1) Test for negative RHW
2999 * 2) If set,
3000 * set ndc = (0,0,0,0)
3001 * set ucp[6] = 1
3002 *
3003 * Later, clipping will detect ucp[6] and ensure the primitive is
3004 * clipped against all fixed planes.
3005 */
3006 if (brw->has_negative_rhw_bug) {
3007 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3008 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3009 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3010 vec4_instruction *inst;
3011 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3012 inst->predicate = BRW_PREDICATE_NORMAL;
3013 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3014 inst->predicate = BRW_PREDICATE_NORMAL;
3015 }
3016
3017 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3018 } else if (brw->gen < 6) {
3019 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3020 } else {
3021 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3022 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3023 dst_reg reg_w = reg;
3024 reg_w.writemask = WRITEMASK_W;
3025 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3026 }
3027 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3028 dst_reg reg_y = reg;
3029 reg_y.writemask = WRITEMASK_Y;
3030 reg_y.type = BRW_REGISTER_TYPE_D;
3031 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3032 }
3033 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3034 dst_reg reg_z = reg;
3035 reg_z.writemask = WRITEMASK_Z;
3036 reg_z.type = BRW_REGISTER_TYPE_D;
3037 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3038 }
3039 }
3040 }
3041
3042 void
3043 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3044 {
3045 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3046 *
3047 * "If a linked set of shaders forming the vertex stage contains no
3048 * static write to gl_ClipVertex or gl_ClipDistance, but the
3049 * application has requested clipping against user clip planes through
3050 * the API, then the coordinate written to gl_Position is used for
3051 * comparison against the user clip planes."
3052 *
3053 * This function is only called if the shader didn't write to
3054 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3055 * if the user wrote to it; otherwise we use gl_Position.
3056 */
3057 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3058 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3059 clip_vertex = VARYING_SLOT_POS;
3060 }
3061
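/* Each DP4 writes one channel: the signed distance of the clip vertex from the corresponding user clip plane. */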
3062 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3063 ++i) {
3064 reg.writemask = 1 << i;
3065 emit(DP4(reg,
3066 src_reg(output_reg[clip_vertex]),
3067 src_reg(this->userplane[i + offset])));
3068 }
3069 }
3070
3071 vec4_instruction *
3072 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3073 {
3074 assert (varying < VARYING_SLOT_MAX);
3075 reg.type = output_reg[varying].type;
3076 current_annotation = output_reg_annotation[varying];
3077 /* Copy the register, saturating if necessary */
3078 return emit(MOV(reg, src_reg(output_reg[varying])));
3079 }
3080
3081 void
3082 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3083 {
3084 reg.type = BRW_REGISTER_TYPE_F;
3085
3086 switch (varying) {
3087 case VARYING_SLOT_PSIZ:
3088 {
3089 /* PSIZ is always in slot 0, and is coupled with other flags. */
3090 current_annotation = "indices, point width, clip flags";
3091 emit_psiz_and_flags(reg);
3092 break;
3093 }
3094 case BRW_VARYING_SLOT_NDC:
3095 current_annotation = "NDC";
3096 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3097 break;
3098 case VARYING_SLOT_POS:
3099 current_annotation = "gl_Position";
3100 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3101 break;
3102 case VARYING_SLOT_EDGE:
3103 /* This is present when doing unfilled polygons. We're supposed to copy
3104 * the edge flag from the user-provided vertex array
3105 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3106 * of that attribute (starts as 1.0f). This is then used in clipping to
3107 * determine which edges should be drawn as wireframe.
3108 */
3109 current_annotation = "edge flag";
3110 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3111 glsl_type::float_type, WRITEMASK_XYZW))));
3112 break;
3113 case BRW_VARYING_SLOT_PAD:
3114 /* No need to write to this slot */
3115 break;
3116 case VARYING_SLOT_COL0:
3117 case VARYING_SLOT_COL1:
3118 case VARYING_SLOT_BFC0:
3119 case VARYING_SLOT_BFC1: {
3120 /* These built-in varyings are only supported in compatibility mode,
3121 * and we only support GS in core profile. So, this must be a vertex
3122 * shader.
3123 */
3124 assert(stage == MESA_SHADER_VERTEX);
3125 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3126 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3127 inst->saturate = true;
3128 break;
3129 }
3130
3131 default:
3132 emit_generic_urb_slot(reg, varying);
3133 break;
3134 }
3135 }
3136
3137 static int
3138 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3139 {
3140 if (brw->gen >= 6) {
3141 /* URB data written (does not include the message header reg) must
3142 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3143 * section 5.4.3.2.2: URB_INTERLEAVED.
3144 *
3145 * URB entries are allocated on a multiple of 1024 bits, so an
3146 * extra 128 bits written here to make the end align to 256 is
3147 * no problem.
3148 */
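/* mlen includes the message header, so the data length (mlen - 1) is even exactly when mlen is odd. */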
3149 if ((mlen % 2) != 1)
3150 mlen++;
3151 }
3152
3153 return mlen;
3154 }
3155
3156
3157 /**
3158 * Generates the VUE payload plus the necessary URB write instructions to
3159 * output it.
3160 *
3161 * The VUE layout is documented in Volume 2a.
3162 */
3163 void
3164 vec4_visitor::emit_vertex()
3165 {
3166 /* MRF 0 is reserved for the debugger, so start with message header
3167 * in MRF 1.
3168 */
3169 int base_mrf = 1;
3170 int mrf = base_mrf;
3171 /* In the process of generating our URB write message contents, we
3172 * may need to unspill a register or load from an array. Those
3173 * reads would use MRFs 14-15.
3174 */
3175 int max_usable_mrf = 13;
3176
3177 /* The following assertion verifies that max_usable_mrf causes an
3178 * even number of URB write data registers, which will meet gen6's
3179 * requirements for length alignment.
3180 */
3181 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3182
3183 /* First mrf is the g0-based message header containing URB handles and
3184 * such.
3185 */
3186 emit_urb_write_header(mrf++);
3187
3188 if (brw->gen < 6) {
3189 emit_ndc_computation();
3190 }
3191
3192 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3193 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3194 current_annotation = "user clip distances";
3195
3196 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3197 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3198
3199 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3200 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3201 }
3202
3203 /* We may need to split this up into several URB writes, so do them in a
3204 * loop.
3205 */
3206 int slot = 0;
3207 bool complete = false;
3208 do {
3209 /* URB offset is in URB row increments, and each of our MRFs is half of
3210 * one of those, since we're doing interleaved writes.
3211 */
3212 int offset = slot / 2;
3213
3214 mrf = base_mrf + 1;
3215 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3216 emit_urb_slot(dst_reg(MRF, mrf++),
3217 prog_data->vue_map.slot_to_varying[slot]);
3218
3219 /* If this was max_usable_mrf, we can't fit anything more into this
3220 * URB WRITE.
3221 */
3222 if (mrf > max_usable_mrf) {
3223 slot++;
3224 break;
3225 }
3226 }
3227
3228 complete = slot >= prog_data->vue_map.num_slots;
3229 current_annotation = "URB write";
3230 vec4_instruction *inst = emit_urb_write_opcode(complete);
3231 inst->base_mrf = base_mrf;
3232 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3233 inst->offset += offset;
3234 } while(!complete);
3235 }
3236
3237
3238 src_reg
3239 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3240 src_reg *reladdr, int reg_offset)
3241 {
3242 /* Because we store the values to scratch interleaved like our
3243 * vertex data, we need to scale the vec4 index by 2.
3244 */
3245 int message_header_scale = 2;
3246
3247 /* Pre-gen6, the message header uses byte offsets instead of vec4
3248 * (16-byte) offset units.
3249 */
3250 if (brw->gen < 6)
3251 message_header_scale *= 16;
3252
3253 if (reladdr) {
3254 src_reg index = src_reg(this, glsl_type::int_type);
3255
3256 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3257 src_reg(reg_offset)));
3258 emit_before(block, inst, MUL(dst_reg(index), index,
3259 src_reg(message_header_scale)));
3260
3261 return index;
3262 } else {
3263 return src_reg(reg_offset * message_header_scale);
3264 }
3265 }
3266
3267 src_reg
3268 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3269 src_reg *reladdr, int reg_offset)
3270 {
3271 if (reladdr) {
3272 src_reg index = src_reg(this, glsl_type::int_type);
3273
3274 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3275 src_reg(reg_offset)));
3276
3277 /* Pre-gen6, the message header uses byte offsets instead of vec4
3278 * (16-byte) offset units.
3279 */
3280 if (brw->gen < 6) {
3281 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3282 }
3283
3284 return index;
3285 } else if (brw->gen >= 8) {
3286 /* Store the offset in a GRF so we can send-from-GRF. */
3287 src_reg offset = src_reg(this, glsl_type::int_type);
3288 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3289 return offset;
3290 } else {
3291 int message_header_scale = brw->gen < 6 ? 16 : 1;
3292 return src_reg(reg_offset * message_header_scale);
3293 }
3294 }
3295
3296 /**
3297 * Emits an instruction before @inst to load the value named by @orig_src
3298 * from scratch space at @base_offset to @temp.
3299 *
3300 * @base_offset is measured in 32-byte units (the size of a register).
3301 */
3302 void
3303 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3304 dst_reg temp, src_reg orig_src,
3305 int base_offset)
3306 {
3307 int reg_offset = base_offset + orig_src.reg_offset;
3308 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3309 reg_offset);
3310
3311 emit_before(block, inst, SCRATCH_READ(temp, index));
3312 }
3313
3314 /**
3315 * Emits an instruction after @inst to store the value to be written
3316 * to @orig_dst to scratch space at @base_offset, from @temp.
3317 *
3318 * @base_offset is measured in 32-byte units (the size of a register).
3319 */
3320 void
3321 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3322 int base_offset)
3323 {
3324 int reg_offset = base_offset + inst->dst.reg_offset;
3325 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3326 reg_offset);
3327
3328 /* Create a temporary register to store *inst's result in.
3329 *
3330 * We have to be careful in MOVing from our temporary result register in
3331 * the scratch write. If we swizzle from channels of the temporary that
3332 * weren't initialized, it will confuse live interval analysis, which will
3333 * make spilling fail to make progress.
3334 */
3335 src_reg temp = src_reg(this, glsl_type::vec4_type);
3336 temp.type = inst->dst.type;
3337 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3338 int swizzles[4];
3339 for (int i = 0; i < 4; i++)
3340 if (inst->dst.writemask & (1 << i))
3341 swizzles[i] = i;
3342 else
3343 swizzles[i] = first_writemask_chan;
3344 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3345 swizzles[2], swizzles[3]);
3346
3347 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3348 inst->dst.writemask));
3349 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3350 write->predicate = inst->predicate;
3351 write->ir = inst->ir;
3352 write->annotation = inst->annotation;
3353 inst->insert_after(block, write);
3354
3355 inst->dst.file = temp.file;
3356 inst->dst.reg = temp.reg;
3357 inst->dst.reg_offset = temp.reg_offset;
3358 inst->dst.reladdr = NULL;
3359 }
3360
3361 /**
3362 * We can't generally support array access in GRF space, because a
3363 * single instruction's destination can only span 2 contiguous
3364 * registers. So, we send all GRF arrays that get variable index
3365 * access to scratch space.
3366 */
3367 void
3368 vec4_visitor::move_grf_array_access_to_scratch()
3369 {
3370 int scratch_loc[this->alloc.count];
3371 memset(scratch_loc, -1, sizeof(scratch_loc));
3372
3373 /* First, calculate the set of virtual GRFs that need to be punted
3374 * to scratch due to having any array access on them, and where in
3375 * scratch.
3376 */
3377 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3378 if (inst->dst.file == GRF && inst->dst.reladdr &&
3379 scratch_loc[inst->dst.reg] == -1) {
3380 scratch_loc[inst->dst.reg] = c->last_scratch;
3381 c->last_scratch += this->alloc.sizes[inst->dst.reg];
3382 }
3383
3384 for (int i = 0 ; i < 3; i++) {
3385 src_reg *src = &inst->src[i];
3386
3387 if (src->file == GRF && src->reladdr &&
3388 scratch_loc[src->reg] == -1) {
3389 scratch_loc[src->reg] = c->last_scratch;
3390 c->last_scratch += this->alloc.sizes[src->reg];
3391 }
3392 }
3393 }
3394
3395 /* Now, for anything that will be accessed through scratch, rewrite
3396 * it to load/store. Note that this is a _safe list walk, because
3397 * we may generate a new scratch_write instruction after the one
3398 * we're processing.
3399 */
3400 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3401 /* Set up the annotation tracking for new generated instructions. */
3402 base_ir = inst->ir;
3403 current_annotation = inst->annotation;
3404
3405 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3406 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3407 }
3408
3409 for (int i = 0 ; i < 3; i++) {
3410 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3411 continue;
3412
3413 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3414
3415 emit_scratch_read(block, inst, temp, inst->src[i],
3416 scratch_loc[inst->src[i].reg]);
3417
3418 inst->src[i].file = temp.file;
3419 inst->src[i].reg = temp.reg;
3420 inst->src[i].reg_offset = temp.reg_offset;
3421 inst->src[i].reladdr = NULL;
3422 }
3423 }
3424 }
3425
3426 /**
3427 * Emits an instruction before @inst to load the value named by @orig_src
3428 * from the pull constant buffer (surface) at @base_offset to @temp.
3429 */
3430 void
3431 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3432 dst_reg temp, src_reg orig_src,
3433 int base_offset)
3434 {
3435 int reg_offset = base_offset + orig_src.reg_offset;
3436 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3437 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3438 reg_offset);
3439 vec4_instruction *load;
3440
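/* On Gen7+ the pull constant load is a send-from-GRF message, so the offset has to be staged in a GRF; earlier gens take it through an MRF-based message. */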
3441 if (brw->gen >= 7) {
3442 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3443 grf_offset.type = offset.type;
3444 emit_before(block, inst, MOV(grf_offset, offset));
3445
3446 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3447 temp, index, src_reg(grf_offset));
3448 load->mlen = 1;
3449 } else {
3450 load = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
3451 temp, index, offset);
3452 load->base_mrf = 14;
3453 load->mlen = 1;
3454 }
3455 emit_before(block, inst, load);
3456 }
3457
3458 /**
3459 * Implements array access of uniforms by inserting a
3460 * PULL_CONSTANT_LOAD instruction.
3461 *
3462 * Unlike temporary GRF array access (which we don't support, due to
3463 * the difficulty of doing relative addressing on instruction
3464 * destinations), we could potentially do array access of uniforms
3465 * that were loaded in GRF space as push constants. In real-world
3466 * usage we've seen, though, the arrays being used are always larger
3467 * than we could load as push constants, so just always move all
3468 * uniform array access out to a pull constant buffer.
3469 */
3470 void
3471 vec4_visitor::move_uniform_array_access_to_pull_constants()
3472 {
3473 int pull_constant_loc[this->uniforms];
3474 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3475 bool nested_reladdr;
3476
3477 /* Walk through and find array access of uniforms. Put a copy of that
3478 * uniform in the pull constant buffer.
3479 *
3480 * Note that we don't move constant-indexed accesses to arrays. No
3481 * testing has been done of the performance impact of this choice.
3482 */
3483 do {
3484 nested_reladdr = false;
3485
3486 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3487 for (int i = 0 ; i < 3; i++) {
3488 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3489 continue;
3490
3491 int uniform = inst->src[i].reg;
3492
3493 if (inst->src[i].reladdr->reladdr)
3494 nested_reladdr = true; /* will need another pass */
3495
3496 /* If this array isn't already present in the pull constant buffer,
3497 * add it.
3498 */
3499 if (pull_constant_loc[uniform] == -1) {
3500 const gl_constant_value **values =
3501 &stage_prog_data->param[uniform * 4];
3502
3503 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3504
3505 assert(uniform < uniform_array_size);
3506 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3507 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3508 = values[j];
3509 }
3510 }
3511
3512 /* Set up the annotation tracking for new generated instructions. */
3513 base_ir = inst->ir;
3514 current_annotation = inst->annotation;
3515
3516 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3517
3518 emit_pull_constant_load(block, inst, temp, inst->src[i],
3519 pull_constant_loc[uniform]);
3520
3521 inst->src[i].file = temp.file;
3522 inst->src[i].reg = temp.reg;
3523 inst->src[i].reg_offset = temp.reg_offset;
3524 inst->src[i].reladdr = NULL;
3525 }
3526 }
3527 } while (nested_reladdr);
3528
3529 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3530 * no need to track them as larger-than-vec4 objects. This will be
3531 * relied on in cutting out unused uniform vectors from push
3532 * constants.
3533 */
3534 split_uniform_registers();
3535 }
3536
3537 void
3538 vec4_visitor::resolve_ud_negate(src_reg *reg)
3539 {
3540 if (reg->type != BRW_REGISTER_TYPE_UD ||
3541 !reg->negate)
3542 return;
3543
3544 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3545 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3546 *reg = temp;
3547 }
3548
3549 /**
3550 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3551 *
3552 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3553 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3554 */
3555 void
3556 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3557 {
3558 assert(brw->gen <= 5);
3559
3560 if (!rvalue->type->is_boolean())
3561 return;
3562
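/* Mask off the undefined upper bits, then negate so 1 becomes ~0 (and 0 stays 0). */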
3563 src_reg and_result = src_reg(this, rvalue->type);
3564 src_reg neg_result = src_reg(this, rvalue->type);
3565 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3566 emit(MOV(dst_reg(neg_result), negate(and_result)));
3567 *reg = neg_result;
3568 }
3569
3570 vec4_visitor::vec4_visitor(struct brw_context *brw,
3571 struct brw_vec4_compile *c,
3572 struct gl_program *prog,
3573 const struct brw_vue_prog_key *key,
3574 struct brw_vue_prog_data *prog_data,
3575 struct gl_shader_program *shader_prog,
3576 gl_shader_stage stage,
3577 void *mem_ctx,
3578 bool debug_flag,
3579 bool no_spills,
3580 shader_time_shader_type st_base,
3581 shader_time_shader_type st_written,
3582 shader_time_shader_type st_reset)
3583 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3584 c(c),
3585 key(key),
3586 prog_data(prog_data),
3587 sanity_param_count(0),
3588 fail_msg(NULL),
3589 first_non_payload_grf(0),
3590 need_all_constants_in_pull_buffer(false),
3591 debug_flag(debug_flag),
3592 no_spills(no_spills),
3593 st_base(st_base),
3594 st_written(st_written),
3595 st_reset(st_reset)
3596 {
3597 this->mem_ctx = mem_ctx;
3598 this->failed = false;
3599
3600 this->base_ir = NULL;
3601 this->current_annotation = NULL;
3602 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3603
3604 this->variable_ht = hash_table_ctor(0,
3605 hash_table_pointer_hash,
3606 hash_table_pointer_compare);
3607
3608 this->virtual_grf_start = NULL;
3609 this->virtual_grf_end = NULL;
3610 this->live_intervals = NULL;
3611
3612 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3613
3614 this->uniforms = 0;
3615
3616 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3617 * at least one. See setup_uniforms() in brw_vec4.cpp.
3618 */
3619 this->uniform_array_size = 1;
3620 if (prog_data) {
3621 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3622 }
3623
3624 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3625 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3626 }
3627
3628 vec4_visitor::~vec4_visitor()
3629 {
3630 hash_table_dtor(this->variable_ht);
3631 }
3632
3633
3634 void
3635 vec4_visitor::fail(const char *format, ...)
3636 {
3637 va_list va;
3638 char *msg;
3639
3640 if (failed)
3641 return;
3642
3643 failed = true;
3644
3645 va_start(va, format);
3646 msg = ralloc_vasprintf(mem_ctx, format, va);
3647 va_end(va);
3648 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3649
3650 this->fail_msg = msg;
3651
3652 if (debug_flag) {
3653 fprintf(stderr, "%s", msg);
3654 }
3655 }
3656
3657 } /* namespace brw */