i965: Set CMP's destination type to src0's type.
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(vec4_visitor *v,
32 enum opcode opcode, const dst_reg &dst,
33 const src_reg &src0, const src_reg &src1,
34 const src_reg &src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->writes_accumulator = false;
46 this->conditional_mod = BRW_CONDITIONAL_NONE;
47 this->target = 0;
48 this->shadow_compare = false;
49 this->ir = v->base_ir;
50 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
51 this->header_present = false;
52 this->mlen = 0;
53 this->base_mrf = 0;
54 this->offset = 0;
55 this->annotation = v->current_annotation;
56 }
57
58 vec4_instruction *
59 vec4_visitor::emit(vec4_instruction *inst)
60 {
61 this->instructions.push_tail(inst);
62
63 return inst;
64 }
65
66 vec4_instruction *
67 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
68 vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(block, new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
80 const src_reg &src1, const src_reg &src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
89 const src_reg &src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
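/* The ALU macros below define trivial builder helpers for one-, two- and
 * three-source opcodes (ALU2_ACC additionally marks the instruction as
 * writing the accumulator, and ALU3 asserts gen >= 6). They construct the
 * vec4_instruction without emitting it; callers pass the result to emit().
 */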
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of each destination channel to the result of the
216 * comparison (the upper bits are undefined), and updates the flag register
217 * with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
225 /* Take the instruction:
226 *
227 * CMP null<d> src0<f> src1<f>
228 *
229 * Original gen4 does type conversion to the destination type before
230 * comparison, producing garbage results for floating point comparisons.
231 *
232 * The destination type doesn't matter on newer generations, so we set the
233 * type to match src0 so we can compact the instruction.
234 */
235 dst.type = src0.type;
236 if (dst.file == HW_REG)
237 dst.fixed_hw_reg.type = dst.type;
238
239 resolve_ud_negate(&src0);
240 resolve_ud_negate(&src1);
241
242 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
243 inst->conditional_mod = condition;
244
245 return inst;
246 }
247
248 vec4_instruction *
249 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
250 {
251 vec4_instruction *inst;
252
253 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
254 dst, index);
255 inst->base_mrf = 14;
256 inst->mlen = 2;
257
258 return inst;
259 }
260
261 vec4_instruction *
262 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
263 const src_reg &index)
264 {
265 vec4_instruction *inst;
266
267 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
268 dst, src, index);
269 inst->base_mrf = 13;
270 inst->mlen = 3;
271
272 return inst;
273 }
274
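/* Emit a dot product of the requested width; elements must be 2, 3 or 4,
 * selecting DP2, DP3 or DP4 respectively.
 */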
275 void
276 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
277 {
278 static enum opcode dot_opcodes[] = {
279 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
280 };
281
282 emit(dot_opcodes[elements - 2], dst, src0, src1);
283 }
284
285 src_reg
286 vec4_visitor::fix_3src_operand(src_reg src)
287 {
288 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
289 * able to use vertical stride of zero to replicate the vec4 uniform, like
290 *
291 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
292 *
293 * But you can't, since vertical stride is always four in three-source
294 * instructions. Instead, insert a MOV instruction to do the replication so
295 * that the three-source instruction can consume it.
296 */
297
298 /* The MOV is only needed if the source is a uniform or immediate. */
299 if (src.file != UNIFORM && src.file != IMM)
300 return src;
301
302 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
303 return src;
304
305 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
306 expanded.type = src.type;
307 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
308 return src_reg(expanded);
309 }
310
311 src_reg
312 vec4_visitor::fix_math_operand(src_reg src)
313 {
314 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
315 return src;
316
317 /* The gen6 math instruction ignores the source modifiers --
318 * swizzle, abs, negate, and at least some parts of the register
319 * region description.
320 *
321 * Rather than trying to enumerate all these cases, *always* expand the
322 * operand to a temp GRF for gen6.
323 *
324 * For gen7, keep the operand as-is, except if immediate, which gen7 still
325 * can't use.
326 */
327
328 if (brw->gen == 7 && src.file != IMM)
329 return src;
330
331 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
332 expanded.type = src.type;
333 emit(MOV(expanded, src));
334 return src_reg(expanded);
335 }
336
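/* Emit a math-unit instruction, working around per-generation restrictions:
 * operands are fixed up via fix_math_operand(), gen6 math can't use a
 * writemask (so the result goes through a temporary), and pre-gen6 math is a
 * send that needs an MRF and message length.
 */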
337 void
338 vec4_visitor::emit_math(enum opcode opcode,
339 const dst_reg &dst,
340 const src_reg &src0, const src_reg &src1)
341 {
342 vec4_instruction *math =
343 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
344
345 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
346 /* MATH on Gen6 must be align1, so we can't do writemasks. */
347 math->dst = dst_reg(this, glsl_type::vec4_type);
348 math->dst.type = dst.type;
349 emit(MOV(dst, src_reg(math->dst)));
350 } else if (brw->gen < 6) {
351 math->base_mrf = 1;
352 math->mlen = src1.file == BAD_FILE ? 1 : 2;
353 }
354 }
355
356 void
357 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
358 {
359 if (brw->gen < 7) {
360 unreachable("ir_unop_pack_half_2x16 should be lowered");
361 }
362
363 assert(dst.type == BRW_REGISTER_TYPE_UD);
364 assert(src0.type == BRW_REGISTER_TYPE_F);
365
366 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
367 *
368 * Because this instruction does not have a 16-bit floating-point type,
369 * the destination data type must be Word (W).
370 *
371 * The destination must be DWord-aligned and specify a horizontal stride
372 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
373 * each destination channel and the upper word is not modified.
374 *
375 * The above restriction implies that the f32to16 instruction must use
376 * align1 mode, because only in align1 mode is it possible to specify
377 * horizontal stride. We choose here to defy the hardware docs and emit
378 * align16 instructions.
379 *
380 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
381 * instructions. I was partially successful in that the code passed all
382 * tests. However, the code was dubiously correct and fragile, and the
383 * tests were not harsh enough to probe that frailty. Not trusting the
384 * code, I chose instead to remain in align16 mode in defiance of the hw
385 * docs).
386 *
387 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
388 * simulator, emitting a f32to16 in align16 mode with UD as destination
389 * data type is safe. The behavior differs from that specified in the PRM
390 * in that the upper word of each destination channel is cleared to 0.
391 */
392
393 dst_reg tmp_dst(this, glsl_type::uvec2_type);
394 src_reg tmp_src(tmp_dst);
395
396 #if 0
397 /* Verify the undocumented behavior on which the following instructions
398 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
399 * then the result of the bit-or instruction below will be incorrect.
400 *
401 * You should inspect the disasm output in order to verify that the MOV is
402 * not optimized away.
403 */
404 emit(MOV(tmp_dst, src_reg(0x12345678u)));
405 #endif
406
407 /* Give tmp the form below, where "." means untouched.
408 *
409 * w z y x w z y x
410 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
411 *
412 * The upper word of each write-channel must be 0 for the
413 * following bit-shift and bit-or instructions to work. Note that this
414 * relies on the undocumented hardware behavior mentioned above.
415 */
416 tmp_dst.writemask = WRITEMASK_XY;
417 emit(F32TO16(tmp_dst, src0));
418
419 /* Give the write-channels of dst the form:
420 * 0xhhhh0000
421 */
422 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
423 emit(SHL(dst, tmp_src, src_reg(16u)));
424
425 /* Finally, give the write-channels of dst the form of packHalf2x16's
426 * output:
427 * 0xhhhhllll
428 */
429 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
430 emit(OR(dst, src_reg(dst), tmp_src));
431 }
432
433 void
434 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
435 {
436 if (brw->gen < 7) {
437 unreachable("ir_unop_unpack_half_2x16 should be lowered");
438 }
439
440 assert(dst.type == BRW_REGISTER_TYPE_F);
441 assert(src0.type == BRW_REGISTER_TYPE_UD);
442
443 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
444 *
445 * Because this instruction does not have a 16-bit floating-point type,
446 * the source data type must be Word (W). The destination type must be
447 * F (Float).
448 *
449 * To use W as the source data type, we must adjust horizontal strides,
450 * which is only possible in align1 mode. All my [chadv] attempts at
451 * emitting align1 instructions for unpackHalf2x16 failed to pass the
452 * Piglit tests, so I gave up.
453 *
454 * I've verified that, on gen7 hardware and the simulator, it is safe to
455 * emit f16to32 in align16 mode with UD as source data type.
456 */
457
458 dst_reg tmp_dst(this, glsl_type::uvec2_type);
459 src_reg tmp_src(tmp_dst);
460
461 tmp_dst.writemask = WRITEMASK_X;
462 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
463
464 tmp_dst.writemask = WRITEMASK_Y;
465 emit(SHR(tmp_dst, src0, src_reg(16u)));
466
467 dst.writemask = WRITEMASK_XY;
468 emit(F16TO32(dst, tmp_src));
469 }
470
471 void
472 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
473 {
474 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
475 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
476 * is not suitable to generate the shift values, but we can use the packed
477 * vector float and a type-converting MOV.
478 */
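/* (If I read the 8-bit vector-float encoding right, the bytes 0x00, 0x60,
 * 0x70 and 0x78 below encode 0.0, 8.0, 16.0 and 24.0; the type-converting
 * MOV into the uvec4 then yields the integer shift counts <0, 8, 16, 24>.)
 */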
479 dst_reg shift(this, glsl_type::uvec4_type);
480 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
481
482 dst_reg shifted(this, glsl_type::uvec4_type);
483 src0.swizzle = BRW_SWIZZLE_XXXX;
484 emit(SHR(shifted, src0, src_reg(shift)));
485
486 shifted.type = BRW_REGISTER_TYPE_UB;
487 dst_reg f(this, glsl_type::vec4_type);
488 emit(MOV(f, src_reg(shifted)));
489
490 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
491 }
492
493 void
494 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
495 {
496 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
497 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
498 * is not suitable to generate the shift values, but we can use the packed
499 * vector float and a type-converting MOV.
500 */
501 dst_reg shift(this, glsl_type::uvec4_type);
502 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
503
504 dst_reg shifted(this, glsl_type::uvec4_type);
505 src0.swizzle = BRW_SWIZZLE_XXXX;
506 emit(SHR(shifted, src0, src_reg(shift)));
507
508 shifted.type = BRW_REGISTER_TYPE_B;
509 dst_reg f(this, glsl_type::vec4_type);
510 emit(MOV(f, src_reg(shifted)));
511
512 dst_reg scaled(this, glsl_type::vec4_type);
513 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
514
515 dst_reg max(this, glsl_type::vec4_type);
516 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
517 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
518 }
519
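/* packUnorm4x8: clamp to [0, 1] with a saturated MOV, scale by 255, round to
 * even, convert to unsigned, and pack the four bytes with
 * VEC4_OPCODE_PACK_BYTES.
 */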
520 void
521 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
522 {
523 dst_reg saturated(this, glsl_type::vec4_type);
524 vec4_instruction *inst = emit(MOV(saturated, src0));
525 inst->saturate = true;
526
527 dst_reg scaled(this, glsl_type::vec4_type);
528 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
529
530 dst_reg rounded(this, glsl_type::vec4_type);
531 emit(RNDE(rounded, src_reg(scaled)));
532
533 dst_reg u(this, glsl_type::uvec4_type);
534 emit(MOV(u, src_reg(rounded)));
535
536 src_reg bytes(u);
537 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
538 }
539
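/* packSnorm4x8: clamp to [-1, 1], scale by 127, round to even, convert to
 * signed, and pack the four bytes with VEC4_OPCODE_PACK_BYTES.
 */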
540 void
541 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
542 {
543 dst_reg max(this, glsl_type::vec4_type);
544 emit_minmax(BRW_CONDITIONAL_G, max, src0, src_reg(-1.0f));
545
546 dst_reg min(this, glsl_type::vec4_type);
547 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
548
549 dst_reg scaled(this, glsl_type::vec4_type);
550 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
551
552 dst_reg rounded(this, glsl_type::vec4_type);
553 emit(RNDE(rounded, src_reg(scaled)));
554
555 dst_reg i(this, glsl_type::ivec4_type);
556 emit(MOV(i, src_reg(rounded)));
557
558 src_reg bytes(i);
559 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
560 }
561
562 void
563 vec4_visitor::visit_instructions(const exec_list *list)
564 {
565 foreach_in_list(ir_instruction, ir, list) {
566 base_ir = ir;
567 ir->accept(this);
568 }
569 }
570
571
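/* Return the size of a value of the given GLSL type, measured in vec4 slots:
 * one per matrix column, one for any scalar or vector, and zero for samplers
 * and atomic counters.
 */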
572 static int
573 type_size(const struct glsl_type *type)
574 {
575 unsigned int i;
576 int size;
577
578 switch (type->base_type) {
579 case GLSL_TYPE_UINT:
580 case GLSL_TYPE_INT:
581 case GLSL_TYPE_FLOAT:
582 case GLSL_TYPE_BOOL:
583 if (type->is_matrix()) {
584 return type->matrix_columns;
585 } else {
586 /* Regardless of the size of the vector, it gets a vec4. This is bad
587 * packing for things like floats, but otherwise arrays become a
588 * mess. Hopefully a later pass over the code can pack scalars
589 * down if appropriate.
590 */
591 return 1;
592 }
593 case GLSL_TYPE_ARRAY:
594 assert(type->length > 0);
595 return type_size(type->fields.array) * type->length;
596 case GLSL_TYPE_STRUCT:
597 size = 0;
598 for (i = 0; i < type->length; i++) {
599 size += type_size(type->fields.structure[i].type);
600 }
601 return size;
602 case GLSL_TYPE_SAMPLER:
603 /* Samplers take up no register space, since they're baked in at
604 * link time.
605 */
606 return 0;
607 case GLSL_TYPE_ATOMIC_UINT:
608 return 0;
609 case GLSL_TYPE_IMAGE:
610 case GLSL_TYPE_VOID:
611 case GLSL_TYPE_ERROR:
612 case GLSL_TYPE_INTERFACE:
613 unreachable("not reached");
614 }
615
616 return 0;
617 }
618
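/* Allocate a new virtual GRF of the given size, growing the size and
 * register-map bookkeeping arrays as needed, and return its index.
 */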
619 int
620 vec4_visitor::virtual_grf_alloc(int size)
621 {
622 if (virtual_grf_array_size <= virtual_grf_count) {
623 if (virtual_grf_array_size == 0)
624 virtual_grf_array_size = 16;
625 else
626 virtual_grf_array_size *= 2;
627 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
628 virtual_grf_array_size);
629 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
630 virtual_grf_array_size);
631 }
632 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
633 virtual_grf_reg_count += size;
634 virtual_grf_sizes[virtual_grf_count] = size;
635 return virtual_grf_count++;
636 }
637
638 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
639 {
640 init();
641
642 this->file = GRF;
643 this->reg = v->virtual_grf_alloc(type_size(type));
644
645 if (type->is_array() || type->is_record()) {
646 this->swizzle = BRW_SWIZZLE_NOOP;
647 } else {
648 this->swizzle = swizzle_for_size(type->vector_elements);
649 }
650
651 this->type = brw_type_for_base_type(type);
652 }
653
654 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
655 {
656 assert(size > 0);
657
658 init();
659
660 this->file = GRF;
661 this->reg = v->virtual_grf_alloc(type_size(type) * size);
662
663 this->swizzle = BRW_SWIZZLE_NOOP;
664
665 this->type = brw_type_for_base_type(type);
666 }
667
668 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
669 {
670 init();
671
672 this->file = GRF;
673 this->reg = v->virtual_grf_alloc(type_size(type));
674
675 if (type->is_array() || type->is_record()) {
676 this->writemask = WRITEMASK_XYZW;
677 } else {
678 this->writemask = (1 << type->vector_elements) - 1;
679 }
680
681 this->type = brw_type_for_base_type(type);
682 }
683
684 /* Our support for uniforms is piggy-backed on the struct
685 * gl_program, because that's where the values actually
686 * get stored, rather than in some global gl_shader_program uniform
687 * store.
688 */
689 void
690 vec4_visitor::setup_uniform_values(ir_variable *ir)
691 {
692 int namelen = strlen(ir->name);
693
694 /* The data for our (non-builtin) uniforms is stored in a series of
695 * gl_uniform_driver_storage structs for each subcomponent that
696 * glGetUniformLocation() could name. We know it's been set up in the same
697 * order we'd walk the type, so walk the list of storage and find anything
698 * with our name, or the prefix of a component that starts with our name.
699 */
700 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
701 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
702
703 if (strncmp(ir->name, storage->name, namelen) != 0 ||
704 (storage->name[namelen] != 0 &&
705 storage->name[namelen] != '.' &&
706 storage->name[namelen] != '[')) {
707 continue;
708 }
709
710 gl_constant_value *components = storage->storage;
711 unsigned vector_count = (MAX2(storage->array_elements, 1) *
712 storage->type->matrix_columns);
713
714 for (unsigned s = 0; s < vector_count; s++) {
715 assert(uniforms < uniform_array_size);
716 uniform_vector_size[uniforms] = storage->type->vector_elements;
717
718 int i;
719 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
720 stage_prog_data->param[uniforms * 4 + i] = components;
721 components++;
722 }
723 for (; i < 4; i++) {
724 static gl_constant_value zero = { 0.0 };
725 stage_prog_data->param[uniforms * 4 + i] = &zero;
726 }
727
728 uniforms++;
729 }
730 }
731 }
732
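/* Lay out one vec4 uniform per user clip plane in the key, pointing the
 * param entries at the values returned by brw_select_clip_planes().
 */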
733 void
734 vec4_visitor::setup_uniform_clipplane_values()
735 {
736 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
737
738 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
739 assert(this->uniforms < uniform_array_size);
740 this->uniform_vector_size[this->uniforms] = 4;
741 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
742 this->userplane[i].type = BRW_REGISTER_TYPE_F;
743 for (int j = 0; j < 4; ++j) {
744 stage_prog_data->param[this->uniforms * 4 + j] =
745 (gl_constant_value *) &clip_planes[i][j];
746 }
747 ++this->uniforms;
748 }
749 }
750
751 /* Our support for builtin uniforms is even scarier than non-builtin.
752 * It sits on top of the PROG_STATE_VAR parameters that are
753 * automatically updated from GL context state.
754 */
755 void
756 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
757 {
758 const ir_state_slot *const slots = ir->get_state_slots();
759 assert(slots != NULL);
760
761 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
762 /* This state reference has already been set up by ir_to_mesa,
763 * but we'll get the same index back here. We can reference
764 * ParameterValues directly, since unlike brw_fs.cpp, we never
765 * add new state references during compile.
766 */
767 int index = _mesa_add_state_reference(this->prog->Parameters,
768 (gl_state_index *)slots[i].tokens);
769 gl_constant_value *values =
770 &this->prog->Parameters->ParameterValues[index][0];
771
772 assert(this->uniforms < uniform_array_size);
773 this->uniform_vector_size[this->uniforms] = 0;
774 /* Add each of the unique swizzled channels of the element.
775 * This will end up matching the size of the glsl_type of this field.
776 */
777 int last_swiz = -1;
778 for (unsigned int j = 0; j < 4; j++) {
779 int swiz = GET_SWZ(slots[i].swizzle, j);
780 last_swiz = swiz;
781
782 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
783 assert(this->uniforms < uniform_array_size);
784 if (swiz <= last_swiz)
785 this->uniform_vector_size[this->uniforms]++;
786 }
787 this->uniforms++;
788 }
789 }
790
791 dst_reg *
792 vec4_visitor::variable_storage(ir_variable *var)
793 {
794 return (dst_reg *)hash_table_find(this->variable_ht, var);
795 }
796
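/* Emit instructions that leave the flag register set according to the given
 * boolean rvalue, and return in *predicate the predicate the caller should
 * use: ALL4H/ANY4H for the vector comparisons, NORMAL otherwise.
 */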
797 void
798 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
799 enum brw_predicate *predicate)
800 {
801 ir_expression *expr = ir->as_expression();
802
803 *predicate = BRW_PREDICATE_NORMAL;
804
805 if (expr && expr->operation != ir_binop_ubo_load) {
806 src_reg op[3];
807 vec4_instruction *inst;
808
809 assert(expr->get_num_operands() <= 3);
810 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
811 expr->operands[i]->accept(this);
812 op[i] = this->result;
813
814 resolve_ud_negate(&op[i]);
815 }
816
817 switch (expr->operation) {
818 case ir_unop_logic_not:
819 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
820 inst->conditional_mod = BRW_CONDITIONAL_Z;
821 break;
822
823 case ir_binop_logic_xor:
824 if (brw->gen <= 5) {
825 src_reg temp = src_reg(this, ir->type);
826 emit(XOR(dst_reg(temp), op[0], op[1]));
827 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
828 } else {
829 inst = emit(XOR(dst_null_d(), op[0], op[1]));
830 }
831 inst->conditional_mod = BRW_CONDITIONAL_NZ;
832 break;
833
834 case ir_binop_logic_or:
835 if (brw->gen <= 5) {
836 src_reg temp = src_reg(this, ir->type);
837 emit(OR(dst_reg(temp), op[0], op[1]));
838 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
839 } else {
840 inst = emit(OR(dst_null_d(), op[0], op[1]));
841 }
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 break;
844
845 case ir_binop_logic_and:
846 if (brw->gen <= 5) {
847 src_reg temp = src_reg(this, ir->type);
848 emit(AND(dst_reg(temp), op[0], op[1]));
849 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
850 } else {
851 inst = emit(AND(dst_null_d(), op[0], op[1]));
852 }
853 inst->conditional_mod = BRW_CONDITIONAL_NZ;
854 break;
855
856 case ir_unop_f2b:
857 if (brw->gen >= 6) {
858 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
859 } else {
860 inst = emit(MOV(dst_null_f(), op[0]));
861 inst->conditional_mod = BRW_CONDITIONAL_NZ;
862 }
863 break;
864
865 case ir_unop_i2b:
866 if (brw->gen >= 6) {
867 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
868 } else {
869 inst = emit(MOV(dst_null_d(), op[0]));
870 inst->conditional_mod = BRW_CONDITIONAL_NZ;
871 }
872 break;
873
874 case ir_binop_all_equal:
875 if (brw->gen <= 5) {
876 resolve_bool_comparison(expr->operands[0], &op[0]);
877 resolve_bool_comparison(expr->operands[1], &op[1]);
878 }
879 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
880 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
881 break;
882
883 case ir_binop_any_nequal:
884 if (brw->gen <= 5) {
885 resolve_bool_comparison(expr->operands[0], &op[0]);
886 resolve_bool_comparison(expr->operands[1], &op[1]);
887 }
888 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
889 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
890 break;
891
892 case ir_unop_any:
893 if (brw->gen <= 5) {
894 resolve_bool_comparison(expr->operands[0], &op[0]);
895 }
896 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
897 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
898 break;
899
900 case ir_binop_greater:
901 case ir_binop_gequal:
902 case ir_binop_less:
903 case ir_binop_lequal:
904 case ir_binop_equal:
905 case ir_binop_nequal:
906 if (brw->gen <= 5) {
907 resolve_bool_comparison(expr->operands[0], &op[0]);
908 resolve_bool_comparison(expr->operands[1], &op[1]);
909 }
910 emit(CMP(dst_null_d(), op[0], op[1],
911 brw_conditional_for_comparison(expr->operation)));
912 break;
913
914 case ir_triop_csel: {
915 /* Expand the boolean condition into the flag register. */
916 inst = emit(MOV(dst_null_d(), op[0]));
917 inst->conditional_mod = BRW_CONDITIONAL_NZ;
918
919 /* Select which boolean to return. */
920 dst_reg temp(this, expr->operands[1]->type);
921 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
922 inst->predicate = BRW_PREDICATE_NORMAL;
923
924 /* Expand the result to a condition code. */
925 inst = emit(MOV(dst_null_d(), src_reg(temp)));
926 inst->conditional_mod = BRW_CONDITIONAL_NZ;
927 break;
928 }
929
930 default:
931 unreachable("not reached");
932 }
933 return;
934 }
935
936 ir->accept(this);
937
938 resolve_ud_negate(&this->result);
939
940 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
941 inst->conditional_mod = BRW_CONDITIONAL_NZ;
942 }
943
944 /**
945 * Emit a gen6 IF statement with the comparison folded into the IF
946 * instruction.
947 */
948 void
949 vec4_visitor::emit_if_gen6(ir_if *ir)
950 {
951 ir_expression *expr = ir->condition->as_expression();
952
953 if (expr && expr->operation != ir_binop_ubo_load) {
954 src_reg op[3];
955 dst_reg temp;
956
957 assert(expr->get_num_operands() <= 3);
958 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
959 expr->operands[i]->accept(this);
960 op[i] = this->result;
961 }
962
963 switch (expr->operation) {
964 case ir_unop_logic_not:
965 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
966 return;
967
968 case ir_binop_logic_xor:
969 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
970 return;
971
972 case ir_binop_logic_or:
973 temp = dst_reg(this, glsl_type::bool_type);
974 emit(OR(temp, op[0], op[1]));
975 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
976 return;
977
978 case ir_binop_logic_and:
979 temp = dst_reg(this, glsl_type::bool_type);
980 emit(AND(temp, op[0], op[1]));
981 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
982 return;
983
984 case ir_unop_f2b:
985 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
986 return;
987
988 case ir_unop_i2b:
989 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
990 return;
991
992 case ir_binop_greater:
993 case ir_binop_gequal:
994 case ir_binop_less:
995 case ir_binop_lequal:
996 case ir_binop_equal:
997 case ir_binop_nequal:
998 emit(IF(op[0], op[1],
999 brw_conditional_for_comparison(expr->operation)));
1000 return;
1001
1002 case ir_binop_all_equal:
1003 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1004 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1005 return;
1006
1007 case ir_binop_any_nequal:
1008 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1009 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1010 return;
1011
1012 case ir_unop_any:
1013 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1014 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1015 return;
1016
1017 case ir_triop_csel: {
1018 /* Expand the boolean condition into the flag register. */
1019 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1020 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1021
1022 /* Select which boolean to return. */
1023 dst_reg temp(this, expr->operands[1]->type);
1024 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1025 inst->predicate = BRW_PREDICATE_NORMAL;
1026
1027 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1028 return;
1029 }
1030
1031 default:
1032 unreachable("not reached");
1033 }
1034 return;
1035 }
1036
1037 ir->condition->accept(this);
1038
1039 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1040 }
1041
1042 void
1043 vec4_visitor::visit(ir_variable *ir)
1044 {
1045 dst_reg *reg = NULL;
1046
1047 if (variable_storage(ir))
1048 return;
1049
1050 switch (ir->data.mode) {
1051 case ir_var_shader_in:
1052 assert(ir->data.location != -1);
1053 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1054 break;
1055
1056 case ir_var_shader_out:
1057 assert(ir->data.location != -1);
1058 reg = new(mem_ctx) dst_reg(this, ir->type);
1059
1060 for (int i = 0; i < type_size(ir->type); i++) {
1061 output_reg[ir->data.location + i] = *reg;
1062 output_reg[ir->data.location + i].reg_offset = i;
1063 output_reg[ir->data.location + i].type =
1064 brw_type_for_base_type(ir->type->get_scalar_type());
1065 output_reg_annotation[ir->data.location + i] = ir->name;
1066 }
1067 break;
1068
1069 case ir_var_auto:
1070 case ir_var_temporary:
1071 reg = new(mem_ctx) dst_reg(this, ir->type);
1072 break;
1073
1074 case ir_var_uniform:
1075 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1076
1077 /* Thanks to the lower_ubo_reference pass, we will see only
1078 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1079 * variables, so no need for them to be in variable_ht.
1080 *
1081 * Some uniforms, such as samplers and atomic counters, have no actual
1082 * storage, so we should ignore them.
1083 */
1084 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1085 return;
1086
1087 /* Track how big the whole uniform variable is, in case we need to put a
1088 * copy of its data into pull constants for array access.
1089 */
1090 assert(this->uniforms < uniform_array_size);
1091 this->uniform_size[this->uniforms] = type_size(ir->type);
1092
1093 if (!strncmp(ir->name, "gl_", 3)) {
1094 setup_builtin_uniform_values(ir);
1095 } else {
1096 setup_uniform_values(ir);
1097 }
1098 break;
1099
1100 case ir_var_system_value:
1101 reg = make_reg_for_system_value(ir);
1102 break;
1103
1104 default:
1105 unreachable("not reached");
1106 }
1107
1108 reg->type = brw_type_for_base_type(ir->type);
1109 hash_table_insert(this->variable_ht, reg, ir);
1110 }
1111
1112 void
1113 vec4_visitor::visit(ir_loop *ir)
1114 {
1115 /* We don't want debugging output to print the whole body of the
1116 * loop as the annotation.
1117 */
1118 this->base_ir = NULL;
1119
1120 emit(BRW_OPCODE_DO);
1121
1122 visit_instructions(&ir->body_instructions);
1123
1124 emit(BRW_OPCODE_WHILE);
1125 }
1126
1127 void
1128 vec4_visitor::visit(ir_loop_jump *ir)
1129 {
1130 switch (ir->mode) {
1131 case ir_loop_jump::jump_break:
1132 emit(BRW_OPCODE_BREAK);
1133 break;
1134 case ir_loop_jump::jump_continue:
1135 emit(BRW_OPCODE_CONTINUE);
1136 break;
1137 }
1138 }
1139
1140
1141 void
1142 vec4_visitor::visit(ir_function_signature *)
1143 {
1144 unreachable("not reached");
1145 }
1146
1147 void
1148 vec4_visitor::visit(ir_function *ir)
1149 {
1150 /* Ignore function bodies other than main() -- we shouldn't see calls to
1151 * them since they should all be inlined.
1152 */
1153 if (strcmp(ir->name, "main") == 0) {
1154 const ir_function_signature *sig;
1155 exec_list empty;
1156
1157 sig = ir->matching_signature(NULL, &empty, false);
1158
1159 assert(sig);
1160
1161 visit_instructions(&sig->body);
1162 }
1163 }
1164
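/* Try to recognize an add with a multiply operand and emit it as a single
 * MAD. Returns false if the pattern doesn't match, the type isn't float, or
 * the hardware predates three-source instructions.
 */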
1165 bool
1166 vec4_visitor::try_emit_mad(ir_expression *ir)
1167 {
1168 /* 3-src instructions were introduced in gen6. */
1169 if (brw->gen < 6)
1170 return false;
1171
1172 /* MAD can only handle floating-point data. */
1173 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1174 return false;
1175
1176 ir_rvalue *nonmul = ir->operands[1];
1177 ir_expression *mul = ir->operands[0]->as_expression();
1178
1179 if (!mul || mul->operation != ir_binop_mul) {
1180 nonmul = ir->operands[0];
1181 mul = ir->operands[1]->as_expression();
1182
1183 if (!mul || mul->operation != ir_binop_mul)
1184 return false;
1185 }
1186
1187 nonmul->accept(this);
1188 src_reg src0 = fix_3src_operand(this->result);
1189
1190 mul->operands[0]->accept(this);
1191 src_reg src1 = fix_3src_operand(this->result);
1192
1193 mul->operands[1]->accept(this);
1194 src_reg src2 = fix_3src_operand(this->result);
1195
1196 this->result = src_reg(this, ir->type);
1197 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1198
1199 return true;
1200 }
1201
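/* Try to emit b2f of a comparison as a CMP that writes 0.0 on failure plus a
 * predicated SEL of 1.0, avoiding a separate boolean temporary. Returns
 * false if the operand isn't a comparison or the hardware can't rely on
 * CMP's zero-on-false behavior.
 */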
1202 bool
1203 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1204 {
1205 /* This optimization relies on CMP setting the destination to 0 when
1206 * false. Early hardware only sets the least significant bit, and
1207 * leaves the other bits undefined. So we can't use it.
1208 */
1209 if (brw->gen < 6)
1210 return false;
1211
1212 ir_expression *const cmp = ir->operands[0]->as_expression();
1213
1214 if (cmp == NULL)
1215 return false;
1216
1217 switch (cmp->operation) {
1218 case ir_binop_less:
1219 case ir_binop_greater:
1220 case ir_binop_lequal:
1221 case ir_binop_gequal:
1222 case ir_binop_equal:
1223 case ir_binop_nequal:
1224 break;
1225
1226 default:
1227 return false;
1228 }
1229
1230 cmp->operands[0]->accept(this);
1231 const src_reg cmp_src0 = this->result;
1232
1233 cmp->operands[1]->accept(this);
1234 const src_reg cmp_src1 = this->result;
1235
1236 this->result = src_reg(this, ir->type);
1237
1238 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1239 brw_conditional_for_comparison(cmp->operation)));
1240
1241 /* If the comparison is false, this->result will just happen to be zero.
1242 */
1243 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1244 this->result, src_reg(1.0f));
1245 inst->predicate = BRW_PREDICATE_NORMAL;
1246 inst->predicate_inverse = true;
1247
1248 return true;
1249 }
1250
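/* Emit a MIN/MAX: on gen6+ a SEL with the L/G conditional mod does it in a
 * single instruction; earlier generations need an explicit CMP followed by a
 * predicated SEL.
 */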
1251 void
1252 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1253 src_reg src0, src_reg src1)
1254 {
1255 vec4_instruction *inst;
1256
1257 if (brw->gen >= 6) {
1258 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1259 inst->conditional_mod = conditionalmod;
1260 } else {
1261 emit(CMP(dst, src0, src1, conditionalmod));
1262
1263 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1264 inst->predicate = BRW_PREDICATE_NORMAL;
1265 }
1266 }
1267
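/* Emit mix(x, y, a): a single LRP on gen6+, or the expanded
 * x * (1 - a) + y * a sequence on older hardware.
 */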
1268 void
1269 vec4_visitor::emit_lrp(const dst_reg &dst,
1270 const src_reg &x, const src_reg &y, const src_reg &a)
1271 {
1272 if (brw->gen >= 6) {
1273 /* Note that the instruction's argument order is reversed from GLSL
1274 * and the IR.
1275 */
1276 emit(LRP(dst,
1277 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1278 } else {
1279 /* Earlier generations don't support three source operations, so we
1280 * need to emit x*(1-a) + y*a.
1281 */
1282 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1283 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1284 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1285 y_times_a.writemask = dst.writemask;
1286 one_minus_a.writemask = dst.writemask;
1287 x_times_one_minus_a.writemask = dst.writemask;
1288
1289 emit(MUL(y_times_a, y, a));
1290 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1291 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1292 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1293 }
1294 }
1295
1296 void
1297 vec4_visitor::visit(ir_expression *ir)
1298 {
1299 unsigned int operand;
1300 src_reg op[Elements(ir->operands)];
1301 vec4_instruction *inst;
1302
1303 if (ir->operation == ir_binop_add) {
1304 if (try_emit_mad(ir))
1305 return;
1306 }
1307
1308 if (ir->operation == ir_unop_b2f) {
1309 if (try_emit_b2f_of_compare(ir))
1310 return;
1311 }
1312
1313 /* Storage for our result. Ideally for an assignment we'd be using
1314 * the actual storage for the result here, instead.
1315 */
1316 dst_reg result_dst(this, ir->type);
1317 src_reg result_src(result_dst);
1318
1319 if (ir->operation == ir_triop_csel) {
1320 ir->operands[1]->accept(this);
1321 op[1] = this->result;
1322 ir->operands[2]->accept(this);
1323 op[2] = this->result;
1324
1325 enum brw_predicate predicate;
1326 emit_bool_to_cond_code(ir->operands[0], &predicate);
1327 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1328 inst->predicate = predicate;
1329 this->result = result_src;
1330 return;
1331 }
1332
1333 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1334 this->result.file = BAD_FILE;
1335 ir->operands[operand]->accept(this);
1336 if (this->result.file == BAD_FILE) {
1337 fprintf(stderr, "Failed to get tree for expression operand:\n");
1338 ir->operands[operand]->fprint(stderr);
1339 exit(1);
1340 }
1341 op[operand] = this->result;
1342
1343 /* Matrix expression operands should have been broken down to vector
1344 * operations already.
1345 */
1346 assert(!ir->operands[operand]->type->is_matrix());
1347 }
1348
1349 /* If nothing special happens, this is the result. */
1350 this->result = result_src;
1351
1352 switch (ir->operation) {
1353 case ir_unop_logic_not:
1354 emit(NOT(result_dst, op[0]));
1355 break;
1356 case ir_unop_neg:
1357 op[0].negate = !op[0].negate;
1358 emit(MOV(result_dst, op[0]));
1359 break;
1360 case ir_unop_abs:
1361 op[0].abs = true;
1362 op[0].negate = false;
1363 emit(MOV(result_dst, op[0]));
1364 break;
1365
1366 case ir_unop_sign:
1367 if (ir->type->is_float()) {
1368 /* AND(val, 0x80000000) gives the sign bit.
1369 *
1370 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1371 * zero.
1372 */
1373 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1374
1375 op[0].type = BRW_REGISTER_TYPE_UD;
1376 result_dst.type = BRW_REGISTER_TYPE_UD;
1377 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1378
1379 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1380 inst->predicate = BRW_PREDICATE_NORMAL;
1381
1382 this->result.type = BRW_REGISTER_TYPE_F;
1383 } else {
1384 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1385 * -> non-negative val generates 0x00000000.
1386 * Predicated OR sets 1 if val is positive.
1387 */
1388 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1389
1390 emit(ASR(result_dst, op[0], src_reg(31)));
1391
1392 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1393 inst->predicate = BRW_PREDICATE_NORMAL;
1394 }
1395 break;
1396
1397 case ir_unop_rcp:
1398 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1399 break;
1400
1401 case ir_unop_exp2:
1402 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1403 break;
1404 case ir_unop_log2:
1405 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1406 break;
1407 case ir_unop_exp:
1408 case ir_unop_log:
1409 unreachable("not reached: should be handled by ir_explog_to_explog2");
1410 case ir_unop_sin:
1411 case ir_unop_sin_reduced:
1412 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1413 break;
1414 case ir_unop_cos:
1415 case ir_unop_cos_reduced:
1416 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1417 break;
1418
1419 case ir_unop_dFdx:
1420 case ir_unop_dFdx_coarse:
1421 case ir_unop_dFdx_fine:
1422 case ir_unop_dFdy:
1423 case ir_unop_dFdy_coarse:
1424 case ir_unop_dFdy_fine:
1425 unreachable("derivatives not valid in vertex shader");
1426
1427 case ir_unop_bitfield_reverse:
1428 emit(BFREV(result_dst, op[0]));
1429 break;
1430 case ir_unop_bit_count:
1431 emit(CBIT(result_dst, op[0]));
1432 break;
1433 case ir_unop_find_msb: {
1434 src_reg temp = src_reg(this, glsl_type::uint_type);
1435
1436 inst = emit(FBH(dst_reg(temp), op[0]));
1437 inst->dst.writemask = WRITEMASK_XYZW;
1438
1439 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1440 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1441 * subtract the result from 31 to convert the MSB count into an LSB count.
1442 */
1443
1444 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1445 temp.swizzle = BRW_SWIZZLE_NOOP;
1446 emit(MOV(result_dst, temp));
1447
1448 src_reg src_tmp = src_reg(result_dst);
1449 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1450
1451 src_tmp.negate = true;
1452 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1453 inst->predicate = BRW_PREDICATE_NORMAL;
1454 break;
1455 }
1456 case ir_unop_find_lsb:
1457 emit(FBL(result_dst, op[0]));
1458 break;
1459 case ir_unop_saturate:
1460 inst = emit(MOV(result_dst, op[0]));
1461 inst->saturate = true;
1462 break;
1463
1464 case ir_unop_noise:
1465 unreachable("not reached: should be handled by lower_noise");
1466
1467 case ir_binop_add:
1468 emit(ADD(result_dst, op[0], op[1]));
1469 break;
1470 case ir_binop_sub:
1471 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1472
1473 case ir_binop_mul:
1474 if (brw->gen < 8 && ir->type->is_integer()) {
1475 /* For integer multiplication, the MUL uses the low 16 bits of one of
1476 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1477 * then accumulates the contribution of the upper 16 bits of that
1478 * operand. If we can determine that one of the args is in the low
1479 * 16 bits, though, we can just emit a single MUL.
1480 */
1481 if (ir->operands[0]->is_uint16_constant()) {
1482 if (brw->gen < 7)
1483 emit(MUL(result_dst, op[0], op[1]));
1484 else
1485 emit(MUL(result_dst, op[1], op[0]));
1486 } else if (ir->operands[1]->is_uint16_constant()) {
1487 if (brw->gen < 7)
1488 emit(MUL(result_dst, op[1], op[0]));
1489 else
1490 emit(MUL(result_dst, op[0], op[1]));
1491 } else {
1492 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1493
1494 emit(MUL(acc, op[0], op[1]));
1495 emit(MACH(dst_null_d(), op[0], op[1]));
1496 emit(MOV(result_dst, src_reg(acc)));
1497 }
1498 } else {
1499 emit(MUL(result_dst, op[0], op[1]));
1500 }
1501 break;
1502 case ir_binop_imul_high: {
1503 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1504
1505 emit(MUL(acc, op[0], op[1]));
1506 emit(MACH(result_dst, op[0], op[1]));
1507 break;
1508 }
1509 case ir_binop_div:
1510 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1511 assert(ir->type->is_integer());
1512 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1513 break;
1514 case ir_binop_carry: {
1515 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1516
1517 emit(ADDC(dst_null_ud(), op[0], op[1]));
1518 emit(MOV(result_dst, src_reg(acc)));
1519 break;
1520 }
1521 case ir_binop_borrow: {
1522 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1523
1524 emit(SUBB(dst_null_ud(), op[0], op[1]));
1525 emit(MOV(result_dst, src_reg(acc)));
1526 break;
1527 }
1528 case ir_binop_mod:
1529 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1530 assert(ir->type->is_integer());
1531 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1532 break;
1533
1534 case ir_binop_less:
1535 case ir_binop_greater:
1536 case ir_binop_lequal:
1537 case ir_binop_gequal:
1538 case ir_binop_equal:
1539 case ir_binop_nequal: {
1540 if (brw->gen <= 5) {
1541 resolve_bool_comparison(ir->operands[0], &op[0]);
1542 resolve_bool_comparison(ir->operands[1], &op[1]);
1543 }
1544 emit(CMP(result_dst, op[0], op[1],
1545 brw_conditional_for_comparison(ir->operation)));
1546 break;
1547 }
1548
1549 case ir_binop_all_equal:
1550 /* "==" operator producing a scalar boolean. */
1551 if (ir->operands[0]->type->is_vector() ||
1552 ir->operands[1]->type->is_vector()) {
1553 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1554 emit(MOV(result_dst, src_reg(0)));
1555 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1556 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1557 } else {
1558 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1559 }
1560 break;
1561 case ir_binop_any_nequal:
1562 /* "!=" operator producing a scalar boolean. */
1563 if (ir->operands[0]->type->is_vector() ||
1564 ir->operands[1]->type->is_vector()) {
1565 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1566
1567 emit(MOV(result_dst, src_reg(0)));
1568 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1569 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1570 } else {
1571 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1572 }
1573 break;
1574
1575 case ir_unop_any:
1576 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1577 emit(MOV(result_dst, src_reg(0)));
1578
1579 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1580 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1581 break;
1582
1583 case ir_binop_logic_xor:
1584 emit(XOR(result_dst, op[0], op[1]));
1585 break;
1586
1587 case ir_binop_logic_or:
1588 emit(OR(result_dst, op[0], op[1]));
1589 break;
1590
1591 case ir_binop_logic_and:
1592 emit(AND(result_dst, op[0], op[1]));
1593 break;
1594
1595 case ir_binop_dot:
1596 assert(ir->operands[0]->type->is_vector());
1597 assert(ir->operands[0]->type == ir->operands[1]->type);
1598 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1599 break;
1600
1601 case ir_unop_sqrt:
1602 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1603 break;
1604 case ir_unop_rsq:
1605 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1606 break;
1607
1608 case ir_unop_bitcast_i2f:
1609 case ir_unop_bitcast_u2f:
1610 this->result = op[0];
1611 this->result.type = BRW_REGISTER_TYPE_F;
1612 break;
1613
1614 case ir_unop_bitcast_f2i:
1615 this->result = op[0];
1616 this->result.type = BRW_REGISTER_TYPE_D;
1617 break;
1618
1619 case ir_unop_bitcast_f2u:
1620 this->result = op[0];
1621 this->result.type = BRW_REGISTER_TYPE_UD;
1622 break;
1623
1624 case ir_unop_i2f:
1625 case ir_unop_i2u:
1626 case ir_unop_u2i:
1627 case ir_unop_u2f:
1628 case ir_unop_f2i:
1629 case ir_unop_f2u:
1630 emit(MOV(result_dst, op[0]));
1631 break;
1632 case ir_unop_b2i:
1633 emit(AND(result_dst, op[0], src_reg(1)));
1634 break;
1635 case ir_unop_b2f:
1636 if (brw->gen <= 5) {
1637 resolve_bool_comparison(ir->operands[0], &op[0]);
1638 }
1639 op[0].type = BRW_REGISTER_TYPE_D;
1640 result_dst.type = BRW_REGISTER_TYPE_D;
1641 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1642 result_dst.type = BRW_REGISTER_TYPE_F;
1643 break;
1644 case ir_unop_f2b:
1645 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1646 break;
1647 case ir_unop_i2b:
1648 emit(AND(result_dst, op[0], src_reg(1)));
1649 break;
1650
1651 case ir_unop_trunc:
1652 emit(RNDZ(result_dst, op[0]));
1653 break;
1654 case ir_unop_ceil: {
1655 src_reg tmp = src_reg(this, ir->type);
1656 op[0].negate = !op[0].negate;
1657 emit(RNDD(dst_reg(tmp), op[0]));
1658 tmp.negate = true;
1659 emit(MOV(result_dst, tmp));
1660 }
1661 break;
1662 case ir_unop_floor:
1663 inst = emit(RNDD(result_dst, op[0]));
1664 break;
1665 case ir_unop_fract:
1666 inst = emit(FRC(result_dst, op[0]));
1667 break;
1668 case ir_unop_round_even:
1669 emit(RNDE(result_dst, op[0]));
1670 break;
1671
1672 case ir_binop_min:
1673 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1674 break;
1675 case ir_binop_max:
1676 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1677 break;
1678
1679 case ir_binop_pow:
1680 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1681 break;
1682
1683 case ir_unop_bit_not:
1684 inst = emit(NOT(result_dst, op[0]));
1685 break;
1686 case ir_binop_bit_and:
1687 inst = emit(AND(result_dst, op[0], op[1]));
1688 break;
1689 case ir_binop_bit_xor:
1690 inst = emit(XOR(result_dst, op[0], op[1]));
1691 break;
1692 case ir_binop_bit_or:
1693 inst = emit(OR(result_dst, op[0], op[1]));
1694 break;
1695
1696 case ir_binop_lshift:
1697 inst = emit(SHL(result_dst, op[0], op[1]));
1698 break;
1699
1700 case ir_binop_rshift:
1701 if (ir->type->base_type == GLSL_TYPE_INT)
1702 inst = emit(ASR(result_dst, op[0], op[1]));
1703 else
1704 inst = emit(SHR(result_dst, op[0], op[1]));
1705 break;
1706
1707 case ir_binop_bfm:
1708 emit(BFI1(result_dst, op[0], op[1]));
1709 break;
1710
1711 case ir_binop_ubo_load: {
1712 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1713 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1714 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1715 src_reg offset;
1716
1717 /* Now, load the vector from that offset. */
1718 assert(ir->type->is_vector() || ir->type->is_scalar());
1719
1720 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1721 packed_consts.type = result.type;
1722 src_reg surf_index;
1723
1724 if (const_uniform_block) {
1725 /* The block index is a constant, so just emit the binding table entry
1726 * as an immediate.
1727 */
1728 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1729 const_uniform_block->value.u[0]);
1730 } else {
1731 /* The block index is not a constant. Evaluate the index expression
1732 * per-channel and add the base UBO index; the generator will select
1733 * a value from any live channel.
1734 */
1735 surf_index = src_reg(this, glsl_type::uint_type);
1736 emit(ADD(dst_reg(surf_index), op[0],
1737 src_reg(prog_data->base.binding_table.ubo_start)));
1738
1739 /* Assume this may touch any UBO. It would be nice to provide
1740 * a tighter bound, but the array information is already lowered away.
1741 */
1742 brw_mark_surface_used(&prog_data->base,
1743 prog_data->base.binding_table.ubo_start +
1744 shader_prog->NumUniformBlocks - 1);
1745 }
1746
1747 if (const_offset_ir) {
1748 if (brw->gen >= 8) {
1749 /* Store the offset in a GRF so we can send-from-GRF. */
1750 offset = src_reg(this, glsl_type::int_type);
1751 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1752 } else {
1753 /* Immediates are fine on older generations since they'll be moved
1754 * to a (potentially fake) MRF at the generator level.
1755 */
1756 offset = src_reg(const_offset / 16);
1757 }
1758 } else {
1759 offset = src_reg(this, glsl_type::uint_type);
1760 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1761 }
1762
1763 if (brw->gen >= 7) {
1764 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1765 grf_offset.type = offset.type;
1766
1767 emit(MOV(grf_offset, offset));
1768
1769 emit(new(mem_ctx) vec4_instruction(this,
1770 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1771 dst_reg(packed_consts),
1772 surf_index,
1773 src_reg(grf_offset)));
1774 } else {
1775 vec4_instruction *pull =
1776 emit(new(mem_ctx) vec4_instruction(this,
1777 VS_OPCODE_PULL_CONSTANT_LOAD,
1778 dst_reg(packed_consts),
1779 surf_index,
1780 offset));
1781 pull->base_mrf = 14;
1782 pull->mlen = 1;
1783 }
1784
1785 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1786 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1787 const_offset % 16 / 4,
1788 const_offset % 16 / 4,
1789 const_offset % 16 / 4);
1790
1791 /* UBO bools are any nonzero int. We need to convert them to use the
1792 * value of true stored in ctx->Const.UniformBooleanTrue.
1793 */
1794 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1795 emit(CMP(result_dst, packed_consts, src_reg(0u),
1796 BRW_CONDITIONAL_NZ));
1797 } else {
1798 emit(MOV(result_dst, packed_consts));
1799 }
1800 break;
1801 }
1802
1803 case ir_binop_vector_extract:
1804 unreachable("should have been lowered by vec_index_to_cond_assign");
1805
1806 case ir_triop_fma:
1807 op[0] = fix_3src_operand(op[0]);
1808 op[1] = fix_3src_operand(op[1]);
1809 op[2] = fix_3src_operand(op[2]);
1810 /* Note that the instruction's argument order is reversed from GLSL
1811 * and the IR.
1812 */
1813 emit(MAD(result_dst, op[2], op[1], op[0]));
1814 break;
1815
1816 case ir_triop_lrp:
1817 emit_lrp(result_dst, op[0], op[1], op[2]);
1818 break;
1819
1820 case ir_triop_csel:
1821 unreachable("already handled above");
1822 break;
1823
1824 case ir_triop_bfi:
1825 op[0] = fix_3src_operand(op[0]);
1826 op[1] = fix_3src_operand(op[1]);
1827 op[2] = fix_3src_operand(op[2]);
1828 emit(BFI2(result_dst, op[0], op[1], op[2]));
1829 break;
1830
1831 case ir_triop_bitfield_extract:
1832 op[0] = fix_3src_operand(op[0]);
1833 op[1] = fix_3src_operand(op[1]);
1834 op[2] = fix_3src_operand(op[2]);
1835 /* Note that the instruction's argument order is reversed from GLSL
1836 * and the IR.
1837 */
1838 emit(BFE(result_dst, op[2], op[1], op[0]));
1839 break;
1840
1841 case ir_triop_vector_insert:
1842 unreachable("should have been lowered by lower_vector_insert");
1843
1844 case ir_quadop_bitfield_insert:
1845 unreachable("not reached: should be handled by "
1846 "bitfield_insert_to_bfm_bfi\n");
1847
1848 case ir_quadop_vector:
1849 unreachable("not reached: should be handled by lower_quadop_vector");
1850
1851 case ir_unop_pack_half_2x16:
1852 emit_pack_half_2x16(result_dst, op[0]);
1853 break;
1854 case ir_unop_unpack_half_2x16:
1855 emit_unpack_half_2x16(result_dst, op[0]);
1856 break;
1857 case ir_unop_unpack_unorm_4x8:
1858 emit_unpack_unorm_4x8(result_dst, op[0]);
1859 break;
1860 case ir_unop_unpack_snorm_4x8:
1861 emit_unpack_snorm_4x8(result_dst, op[0]);
1862 break;
1863 case ir_unop_pack_unorm_4x8:
1864 emit_pack_unorm_4x8(result_dst, op[0]);
1865 break;
1866 case ir_unop_pack_snorm_4x8:
1867 emit_pack_snorm_4x8(result_dst, op[0]);
1868 break;
1869 case ir_unop_pack_snorm_2x16:
1870 case ir_unop_pack_unorm_2x16:
1871 case ir_unop_unpack_snorm_2x16:
1872 case ir_unop_unpack_unorm_2x16:
1873 unreachable("not reached: should be handled by lower_packing_builtins");
1874 case ir_unop_unpack_half_2x16_split_x:
1875 case ir_unop_unpack_half_2x16_split_y:
1876 case ir_binop_pack_half_2x16_split:
1877 case ir_unop_interpolate_at_centroid:
1878 case ir_binop_interpolate_at_sample:
1879 case ir_binop_interpolate_at_offset:
1880 unreachable("not reached: should not occur in vertex shader");
1881 case ir_binop_ldexp:
1882 unreachable("not reached: should be handled by ldexp_to_arith()");
1883 }
1884 }
1885
1886
1887 void
1888 vec4_visitor::visit(ir_swizzle *ir)
1889 {
1890 src_reg src;
1891 int i = 0;
1892 int swizzle[4];
1893
1894 /* Note that this only handles swizzles in expressions, not those on the
1895 * left-hand side of an assignment, which use write masking instead. See
1896 * ir_assignment for that.
1897 */
1898
1899 ir->val->accept(this);
1900 src = this->result;
1901 assert(src.file != BAD_FILE);
1902
1903 for (i = 0; i < ir->type->vector_elements; i++) {
1904 switch (i) {
1905 case 0:
1906 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1907 break;
1908 case 1:
1909 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1910 break;
1911 case 2:
1912 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1913 break;
1914 case 3:
1915 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1916 break;
1917 }
1918 }
1919 for (; i < 4; i++) {
1920 /* Replicate the last channel out. */
1921 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1922 }
1923
1924 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1925
1926 this->result = src;
1927 }
1928
1929 void
1930 vec4_visitor::visit(ir_dereference_variable *ir)
1931 {
1932 const struct glsl_type *type = ir->type;
1933 dst_reg *reg = variable_storage(ir->var);
1934
1935 if (!reg) {
1936 fail("Failed to find variable storage for %s\n", ir->var->name);
1937 this->result = src_reg(brw_null_reg());
1938 return;
1939 }
1940
1941 this->result = src_reg(*reg);
1942
1943 /* System values get their swizzle from the dst_reg writemask */
1944 if (ir->var->data.mode == ir_var_system_value)
1945 return;
1946
1947 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1948 this->result.swizzle = swizzle_for_size(type->vector_elements);
1949 }
1950
1951
1952 int
1953 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1954 {
1955 /* Under normal circumstances array elements are stored consecutively, so
1956 * the stride is equal to the size of the array element.
1957 */
1958 return type_size(ir->type);
1959 }
1960
1961
1962 void
1963 vec4_visitor::visit(ir_dereference_array *ir)
1964 {
1965 ir_constant *constant_index;
1966 src_reg src;
1967 int array_stride = compute_array_stride(ir);
1968
1969 constant_index = ir->array_index->constant_expression_value();
1970
1971 ir->array->accept(this);
1972 src = this->result;
1973
1974 if (constant_index) {
1975 src.reg_offset += constant_index->value.i[0] * array_stride;
1976 } else {
1977 /* Variable index array dereference. It eats the "vec4" of the
1978 * base of the array and an index that offsets the Mesa register
1979 * index.
1980 */
1981 ir->array_index->accept(this);
1982
1983 src_reg index_reg;
1984
1985 if (array_stride == 1) {
1986 index_reg = this->result;
1987 } else {
1988 index_reg = src_reg(this, glsl_type::int_type);
1989
1990 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1991 }
1992
1993 if (src.reladdr) {
1994 src_reg temp = src_reg(this, glsl_type::int_type);
1995
1996 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1997
1998 index_reg = temp;
1999 }
2000
2001 src.reladdr = ralloc(mem_ctx, src_reg);
2002 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2003 }
2004
2005 /* If the type is smaller than a vec4, replicate the last channel out. */
2006 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2007 src.swizzle = swizzle_for_size(ir->type->vector_elements);
2008 else
2009 src.swizzle = BRW_SWIZZLE_NOOP;
2010 src.type = brw_type_for_base_type(ir->type);
2011
2012 this->result = src;
2013 }
2014
2015 void
2016 vec4_visitor::visit(ir_dereference_record *ir)
2017 {
2018 unsigned int i;
2019 const glsl_type *struct_type = ir->record->type;
2020 int offset = 0;
2021
2022 ir->record->accept(this);
2023
2024 for (i = 0; i < struct_type->length; i++) {
2025 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2026 break;
2027 offset += type_size(struct_type->fields.structure[i].type);
2028 }
2029
2030 /* If the type is smaller than a vec4, replicate the last channel out. */
2031 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2032 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2033 else
2034 this->result.swizzle = BRW_SWIZZLE_NOOP;
2035 this->result.type = brw_type_for_base_type(ir->type);
2036
2037 this->result.reg_offset += offset;
2038 }
2039
2040 /**
2041 * We want to be careful in assignment setup to hit the actual storage
2042 * instead of potentially using a temporary like we might with the
2043 * ir_dereference handler.
2044 */
2045 static dst_reg
2046 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2047 {
2048 /* The LHS must be a dereference. If the LHS is a variable indexed array
2049 * access of a vector, it must be separated into a series of conditional moves
2050 * before reaching this point (see ir_vec_index_to_cond_assign).
2051 */
2052 assert(ir->as_dereference());
2053 ir_dereference_array *deref_array = ir->as_dereference_array();
2054 if (deref_array) {
2055 assert(!deref_array->array->type->is_vector());
2056 }
2057
2058 /* Use the rvalue deref handler for the most part. We'll ignore
2059 * swizzles in it and write swizzles using writemask, though.
2060 */
2061 ir->accept(v);
2062 return dst_reg(v->result);
2063 }
2064
2065 void
2066 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2067 const struct glsl_type *type,
2068 enum brw_predicate predicate)
2069 {
2070 if (type->base_type == GLSL_TYPE_STRUCT) {
2071 for (unsigned int i = 0; i < type->length; i++) {
2072 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2073 }
2074 return;
2075 }
2076
2077 if (type->is_array()) {
2078 for (unsigned int i = 0; i < type->length; i++) {
2079 emit_block_move(dst, src, type->fields.array, predicate);
2080 }
2081 return;
2082 }
2083
2084 if (type->is_matrix()) {
2085 const struct glsl_type *vec_type;
2086
2087 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2088 type->vector_elements, 1);
2089
2090 for (int i = 0; i < type->matrix_columns; i++) {
2091 emit_block_move(dst, src, vec_type, predicate);
2092 }
2093 return;
2094 }
2095
2096 assert(type->is_scalar() || type->is_vector());
2097
2098 dst->type = brw_type_for_base_type(type);
2099 src->type = dst->type;
2100
2101 dst->writemask = (1 << type->vector_elements) - 1;
2102
2103 src->swizzle = swizzle_for_size(type->vector_elements);
2104
2105 vec4_instruction *inst = emit(MOV(*dst, *src));
2106 inst->predicate = predicate;
2107
2108 dst->reg_offset++;
2109 src->reg_offset++;
2110 }
2111
2112
2113 /* If the RHS processing resulted in an instruction generating a
2114 * temporary value, and it would be easy to rewrite the instruction to
2115 * generate its result right into the LHS instead, do so. This ends
2116 * up reliably removing instructions where it can be tricky to do so
2117 * later without real UD chain information.
2118 */
2119 bool
2120 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2121 dst_reg dst,
2122 src_reg src,
2123 vec4_instruction *pre_rhs_inst,
2124 vec4_instruction *last_rhs_inst)
2125 {
2126 /* This could be supported, but it would take more smarts. */
2127 if (ir->condition)
2128 return false;
2129
2130 if (pre_rhs_inst == last_rhs_inst)
2131 return false; /* No instructions generated to work with. */
2132
2133 /* Make sure the last instruction generated our source reg. */
2134 if (src.file != GRF ||
2135 src.file != last_rhs_inst->dst.file ||
2136 src.reg != last_rhs_inst->dst.reg ||
2137 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2138 src.reladdr ||
2139 src.abs ||
2140 src.negate ||
2141 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2142 return false;
2143
2144 /* Check that the last instruction fully initialized the channels
2145 * we want to use, in the order we want to use them. We could
2146 * potentially reswizzle the operands of many instructions so that
2147 * we could handle out of order channels, but don't yet.
2148 */
2149
2150 for (unsigned i = 0; i < 4; i++) {
2151 if (dst.writemask & (1 << i)) {
2152 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2153 return false;
2154
2155 if (BRW_GET_SWZ(src.swizzle, i) != i)
2156 return false;
2157 }
2158 }
2159
2160 /* Success! Rewrite the instruction. */
2161 last_rhs_inst->dst.file = dst.file;
2162 last_rhs_inst->dst.reg = dst.reg;
2163 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2164 last_rhs_inst->dst.reladdr = dst.reladdr;
2165 last_rhs_inst->dst.writemask &= dst.writemask;
2166
2167 return true;
2168 }
2169
2170 void
2171 vec4_visitor::visit(ir_assignment *ir)
2172 {
2173 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2174 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2175
2176 if (!ir->lhs->type->is_scalar() &&
2177 !ir->lhs->type->is_vector()) {
2178 ir->rhs->accept(this);
2179 src_reg src = this->result;
2180
2181 if (ir->condition) {
2182 emit_bool_to_cond_code(ir->condition, &predicate);
2183 }
2184
2185 /* emit_block_move doesn't account for swizzles in the source register.
2186 * This should be ok, since the source register is a structure or an
2187 * array, and those can't be swizzled. But double-check to be sure.
2188 */
2189 assert(src.swizzle ==
2190 (ir->rhs->type->is_matrix()
2191 ? swizzle_for_size(ir->rhs->type->vector_elements)
2192 : BRW_SWIZZLE_NOOP));
2193
2194 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2195 return;
2196 }
2197
2198 /* Now we're down to just a scalar/vector with writemasks. */
2199 int i;
2200
2201 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2202 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2203
2204 ir->rhs->accept(this);
2205
2206 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2207
2208 src_reg src = this->result;
2209
2210 int swizzles[4];
2211 int first_enabled_chan = 0;
2212 int src_chan = 0;
2213
2214 assert(ir->lhs->type->is_vector() ||
2215 ir->lhs->type->is_scalar());
2216 dst.writemask = ir->write_mask;
2217
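/* Find the source component that feeds the first enabled destination
 * channel; the channels outside the writemask reuse it below, keeping every
 * swizzle component pointed at a channel the RHS actually wrote.
 */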
2218 for (int i = 0; i < 4; i++) {
2219 if (dst.writemask & (1 << i)) {
2220 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2221 break;
2222 }
2223 }
2224
2225 /* Swizzle a small RHS vector into the channels being written.
2226 *
2227 * GLSL IR treats write_mask as dictating how many channels are
2228 * present on the RHS, while in our instructions we need to make
2229 * those channels appear in the slots of the vec4 they're written to.
2230 */
2231 for (int i = 0; i < 4; i++) {
2232 if (dst.writemask & (1 << i))
2233 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2234 else
2235 swizzles[i] = first_enabled_chan;
2236 }
2237 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2238 swizzles[2], swizzles[3]);
2239
2240 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2241 return;
2242 }
2243
2244 if (ir->condition) {
2245 emit_bool_to_cond_code(ir->condition, &predicate);
2246 }
2247
2248 for (i = 0; i < type_size(ir->lhs->type); i++) {
2249 vec4_instruction *inst = emit(MOV(dst, src));
2250 inst->predicate = predicate;
2251
2252 dst.reg_offset++;
2253 src.reg_offset++;
2254 }
2255 }
2256
2257 void
2258 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2259 {
2260 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2261 foreach_in_list(ir_constant, field_value, &ir->components) {
2262 emit_constant_values(dst, field_value);
2263 }
2264 return;
2265 }
2266
2267 if (ir->type->is_array()) {
2268 for (unsigned int i = 0; i < ir->type->length; i++) {
2269 emit_constant_values(dst, ir->array_elements[i]);
2270 }
2271 return;
2272 }
2273
2274 if (ir->type->is_matrix()) {
2275 for (int i = 0; i < ir->type->matrix_columns; i++) {
2276 float *vec = &ir->value.f[i * ir->type->vector_elements];
2277
2278 for (int j = 0; j < ir->type->vector_elements; j++) {
2279 dst->writemask = 1 << j;
2280 dst->type = BRW_REGISTER_TYPE_F;
2281
2282 emit(MOV(*dst, src_reg(vec[j])));
2283 }
2284 dst->reg_offset++;
2285 }
2286 return;
2287 }
2288
2289 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2290
2291 for (int i = 0; i < ir->type->vector_elements; i++) {
2292 if (!(remaining_writemask & (1 << i)))
2293 continue;
2294
2295 dst->writemask = 1 << i;
2296 dst->type = brw_type_for_base_type(ir->type);
2297
2298 /* Find other components that match the one we're about to
2299 * write. Emits fewer instructions for things like vec4(0.5,
2300 * 1.5, 1.5, 1.5).
2301 */
2302 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2303 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2304 if (ir->value.b[i] == ir->value.b[j])
2305 dst->writemask |= (1 << j);
2306 } else {
2307 /* u, i, and f storage all line up, so no need for a
2308 * switch case for comparing each type.
2309 */
2310 if (ir->value.u[i] == ir->value.u[j])
2311 dst->writemask |= (1 << j);
2312 }
2313 }
2314
2315 switch (ir->type->base_type) {
2316 case GLSL_TYPE_FLOAT:
2317 emit(MOV(*dst, src_reg(ir->value.f[i])));
2318 break;
2319 case GLSL_TYPE_INT:
2320 emit(MOV(*dst, src_reg(ir->value.i[i])));
2321 break;
2322 case GLSL_TYPE_UINT:
2323 emit(MOV(*dst, src_reg(ir->value.u[i])));
2324 break;
2325 case GLSL_TYPE_BOOL:
2326 emit(MOV(*dst,
2327 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2328 : 0)));
2329 break;
2330 default:
2331 unreachable("Non-float/uint/int/bool constant");
2332 }
2333
2334 remaining_writemask &= ~dst->writemask;
2335 }
2336 dst->reg_offset++;
2337 }
2338
2339 void
2340 vec4_visitor::visit(ir_constant *ir)
2341 {
2342 dst_reg dst = dst_reg(this, ir->type);
2343 this->result = src_reg(dst);
2344
2345 emit_constant_values(&dst, ir);
2346 }
2347
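/**
 * Lower the atomic counter intrinsics (__intrinsic_atomic_read, _increment,
 * _predecrement) to untyped surface read/atomic messages against the
 * counter's atomic buffer object surface.
 */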
2348 void
2349 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2350 {
2351 ir_dereference *deref = static_cast<ir_dereference *>(
2352 ir->actual_parameters.get_head());
2353 ir_variable *location = deref->variable_referenced();
2354 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2355 location->data.binding);
2356
2357 /* Calculate the surface offset */
2358 src_reg offset(this, glsl_type::uint_type);
2359 ir_dereference_array *deref_array = deref->as_dereference_array();
2360 if (deref_array) {
2361 deref_array->array_index->accept(this);
2362
2363 src_reg tmp(this, glsl_type::uint_type);
2364 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2365 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2366 } else {
2367 offset = location->data.atomic.offset;
2368 }
2369
2370 /* Emit the appropriate machine instruction */
2371 const char *callee = ir->callee->function_name();
2372 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2373
2374 if (!strcmp("__intrinsic_atomic_read", callee)) {
2375 emit_untyped_surface_read(surf_index, dst, offset);
2376
2377 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2378 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2379 src_reg(), src_reg());
2380
2381 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2382 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2383 src_reg(), src_reg());
2384 }
2385 }
2386
2387 void
2388 vec4_visitor::visit(ir_call *ir)
2389 {
2390 const char *callee = ir->callee->function_name();
2391
2392 if (!strcmp("__intrinsic_atomic_read", callee) ||
2393 !strcmp("__intrinsic_atomic_increment", callee) ||
2394 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2395 visit_atomic_counter_intrinsic(ir);
2396 } else {
2397 unreachable("Unsupported intrinsic.");
2398 }
2399 }
2400
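/**
 * Fetch the MCS (multisample control surface) data for a compressed
 * multisample texture, so it can be passed as an extra parameter to the
 * TXF_CMS message.
 */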
2401 src_reg
2402 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2403 {
2404 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2405 inst->base_mrf = 2;
2406 inst->mlen = 1;
2407 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2408 inst->dst.writemask = WRITEMASK_XYZW;
2409
2410 inst->src[1] = sampler;
2411
2412 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
2413 int param_base = inst->base_mrf;
2414 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2415 int zero_mask = 0xf & ~coord_mask;
2416
2417 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2418 coordinate));
2419
2420 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2421 src_reg(0)));
2422
2423 emit(inst);
2424 return src_reg(inst->dst);
2425 }
2426
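/**
 * Returns true when the sampler index can't be encoded in the 4-bit sampler
 * field of the message descriptor (indices 16 and up, or indices not known
 * at compile time), so the message header must carry the sampler selection.
 * Pre-Haswell parts never take this path.
 */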
2427 static bool
2428 is_high_sampler(struct brw_context *brw, src_reg sampler)
2429 {
2430 if (brw->gen < 8 && !brw->is_haswell)
2431 return false;
2432
2433 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2434 }
2435
2436 void
2437 vec4_visitor::visit(ir_texture *ir)
2438 {
2439 uint32_t sampler =
2440 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2441
2442 ir_rvalue *nonconst_sampler_index =
2443 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2444
2445 /* Handle non-constant sampler array indexing */
2446 src_reg sampler_reg;
2447 if (nonconst_sampler_index) {
2448 /* The highest sampler which may be used by this operation is
2449 * the last element of the array. Mark it here, because the generator
2450 * doesn't have enough information to determine the bound.
2451 */
2452 uint32_t array_size = ir->sampler->as_dereference_array()
2453 ->array->type->array_size();
2454
2455 uint32_t max_used = sampler + array_size - 1;
2456 if (ir->op == ir_tg4 && brw->gen < 8) {
2457 max_used += prog_data->base.binding_table.gather_texture_start;
2458 } else {
2459 max_used += prog_data->base.binding_table.texture_start;
2460 }
2461
2462 brw_mark_surface_used(&prog_data->base, max_used);
2463
2464 /* Emit code to evaluate the actual indexing expression */
2465 nonconst_sampler_index->accept(this);
2466 dst_reg temp(this, glsl_type::uint_type);
2467 emit(ADD(temp, this->result, src_reg(sampler)))
2468 ->force_writemask_all = true;
2469 sampler_reg = src_reg(temp);
2470 } else {
2471 /* Single sampler, or constant array index; the indexing expression
2472 * is just an immediate.
2473 */
2474 sampler_reg = src_reg(sampler);
2475 }
2476
2477 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2478 * emitting anything other than setting up the constant result.
2479 */
2480 if (ir->op == ir_tg4) {
2481 ir_constant *chan = ir->lod_info.component->as_constant();
2482 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2483 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2484 dst_reg result(this, ir->type);
2485 this->result = src_reg(result);
2486 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2487 return;
2488 }
2489 }
2490
2491 /* Should be lowered by do_lower_texture_projection */
2492 assert(!ir->projector);
2493
2494 /* Should be lowered */
2495 assert(!ir->offset || !ir->offset->type->is_array());
2496
2497 /* Generate code to compute all the subexpression trees. This has to be
2498 * done before loading any values into MRFs for the sampler message since
2499 * generating these values may involve SEND messages that need the MRFs.
2500 */
2501 src_reg coordinate;
2502 if (ir->coordinate) {
2503 ir->coordinate->accept(this);
2504 coordinate = this->result;
2505 }
2506
2507 src_reg shadow_comparitor;
2508 if (ir->shadow_comparitor) {
2509 ir->shadow_comparitor->accept(this);
2510 shadow_comparitor = this->result;
2511 }
2512
2513 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2514 src_reg offset_value;
2515 if (has_nonconstant_offset) {
2516 ir->offset->accept(this);
2517 offset_value = src_reg(this->result);
2518 }
2519
2520 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2521 src_reg lod, dPdx, dPdy, sample_index, mcs;
2522 switch (ir->op) {
2523 case ir_tex:
2524 lod = src_reg(0.0f);
2525 lod_type = glsl_type::float_type;
2526 break;
2527 case ir_txf:
2528 case ir_txl:
2529 case ir_txs:
2530 ir->lod_info.lod->accept(this);
2531 lod = this->result;
2532 lod_type = ir->lod_info.lod->type;
2533 break;
2534 case ir_query_levels:
2535 lod = src_reg(0);
2536 lod_type = glsl_type::int_type;
2537 break;
2538 case ir_txf_ms:
2539 ir->lod_info.sample_index->accept(this);
2540 sample_index = this->result;
2541 sample_index_type = ir->lod_info.sample_index->type;
2542
2543 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2544 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2545 else
2546 mcs = src_reg(0u);
2547 break;
2548 case ir_txd:
2549 ir->lod_info.grad.dPdx->accept(this);
2550 dPdx = this->result;
2551
2552 ir->lod_info.grad.dPdy->accept(this);
2553 dPdy = this->result;
2554
2555 lod_type = ir->lod_info.grad.dPdx->type;
2556 break;
2557 case ir_txb:
2558 case ir_lod:
2559 case ir_tg4:
2560 break;
2561 }
2562
2563 enum opcode opcode;
2564 switch (ir->op) {
2565 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2566 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2567 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2568 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2569 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2570 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2571 case ir_tg4: opcode = has_nonconstant_offset
2572 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2573 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2574 case ir_txb:
2575 unreachable("TXB is not valid for vertex shaders.");
2576 case ir_lod:
2577 unreachable("LOD is not valid for vertex shaders.");
2578 default:
2579 unreachable("Unrecognized tex op");
2580 }
2581
2582 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2583
2584 if (ir->offset != NULL && !has_nonconstant_offset) {
2585 inst->offset =
2586 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2587 ir->offset->type->vector_elements);
2588 }
2589
2590 /* Stuff the channel select bits in the top of the texture offset */
2591 if (ir->op == ir_tg4)
2592 inst->offset |= gather_channel(ir, sampler) << 16;
2593
2594 /* The message header is necessary for:
2595 * - Gen4 (always)
2596 * - Gen9+ for selecting SIMD4x2
2597 * - Texel offsets
2598 * - Gather channel selection
2599 * - Sampler indices too large to fit in a 4-bit value.
2600 */
2601 inst->header_present =
2602 brw->gen < 5 || brw->gen >= 9 ||
2603 inst->offset != 0 || ir->op == ir_tg4 ||
2604 is_high_sampler(brw, sampler_reg);
2605 inst->base_mrf = 2;
2606 inst->mlen = inst->header_present + 1; /* always at least one */
2607 inst->dst = dst_reg(this, ir->type);
2608 inst->dst.writemask = WRITEMASK_XYZW;
2609 inst->shadow_compare = ir->shadow_comparitor != NULL;
2610
2611 inst->src[1] = sampler_reg;
2612
2613 /* MRF for the first parameter */
2614 int param_base = inst->base_mrf + inst->header_present;
2615
2616 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2617 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2618 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2619 } else {
2620 /* Load the coordinate */
2621 /* FINISHME: gl_clamp_mask and saturate */
2622 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2623 int zero_mask = 0xf & ~coord_mask;
2624
2625 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2626 coordinate));
2627
2628 if (zero_mask != 0) {
2629 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2630 src_reg(0)));
2631 }
2632 /* Load the shadow comparator */
2633 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2634 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2635 WRITEMASK_X),
2636 shadow_comparitor));
2637 inst->mlen++;
2638 }
2639
2640 /* Load the LOD info */
2641 if (ir->op == ir_tex || ir->op == ir_txl) {
2642 int mrf, writemask;
2643 if (brw->gen >= 5) {
2644 mrf = param_base + 1;
2645 if (ir->shadow_comparitor) {
2646 writemask = WRITEMASK_Y;
2647 /* mlen already incremented */
2648 } else {
2649 writemask = WRITEMASK_X;
2650 inst->mlen++;
2651 }
2652 } else /* brw->gen == 4 */ {
2653 mrf = param_base;
2654 writemask = WRITEMASK_W;
2655 }
2656 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2657 } else if (ir->op == ir_txf) {
2658 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2659 } else if (ir->op == ir_txf_ms) {
2660 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2661 sample_index));
2662 if (brw->gen >= 7) {
2663 /* MCS data is in the first channel of `mcs`, but we need to get it into
2664 * the .y channel of the second vec4 of params, so replicate .x across
2665 * the whole vec4 and then mask off everything except .y
2666 */
2667 mcs.swizzle = BRW_SWIZZLE_XXXX;
2668 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2669 mcs));
2670 }
2671 inst->mlen++;
2672 } else if (ir->op == ir_txd) {
2673 const glsl_type *type = lod_type;
2674
2675 if (brw->gen >= 5) {
2676 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2677 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2678 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2679 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2680 inst->mlen++;
2681
2682 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2683 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2684 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2685 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2686 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2687 inst->mlen++;
2688
2689 if (ir->shadow_comparitor) {
2690 emit(MOV(dst_reg(MRF, param_base + 2,
2691 ir->shadow_comparitor->type, WRITEMASK_Z),
2692 shadow_comparitor));
2693 }
2694 }
2695 } else /* brw->gen == 4 */ {
2696 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2697 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2698 inst->mlen += 2;
2699 }
2700 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2701 if (ir->shadow_comparitor) {
2702 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2703 shadow_comparitor));
2704 }
2705
2706 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2707 offset_value));
2708 inst->mlen++;
2709 }
2710 }
2711
2712 emit(inst);
2713
2714 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2715 * faces * layers, but the spec requires just layers.
2716 */
2717 if (ir->op == ir_txs) {
2718 glsl_type const *type = ir->sampler->type;
2719 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2720 type->sampler_array) {
2721 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2722 writemask(inst->dst, WRITEMASK_Z),
2723 src_reg(inst->dst), src_reg(6));
2724 }
2725 }
2726
2727 if (brw->gen == 6 && ir->op == ir_tg4) {
2728 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2729 }
2730
2731 swizzle_result(ir, src_reg(inst->dst), sampler);
2732 }
2733
2734 /**
2735 * Apply workarounds for Gen6 gather with UINT/SINT
2736 */
2737 void
2738 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2739 {
2740 if (!wa)
2741 return;
2742
2743 int width = (wa & WA_8BIT) ? 8 : 16;
2744 dst_reg dst_f = dst;
2745 dst_f.type = BRW_REGISTER_TYPE_F;
2746
2747 /* Convert from UNORM to UINT */
2748 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2749 emit(MOV(dst, src_reg(dst_f)));
2750
2751 if (wa & WA_SIGN) {
2752 /* Reinterpret the UINT value as a signed INT value by
2753 * shifting the sign bit into place, then shifting back
2754 * preserving sign.
2755 */
2756 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2757 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2758 }
2759 }
2760
2761 /**
2762 * Set up the gather channel based on the swizzle, for gather4.
2763 */
2764 uint32_t
2765 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2766 {
2767 ir_constant *chan = ir->lod_info.component->as_constant();
2768 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2769 switch (swiz) {
2770 case SWIZZLE_X: return 0;
2771 case SWIZZLE_Y:
2772 /* gather4 sampler is broken for green channel on RG32F --
2773 * we must ask for blue instead.
2774 */
2775 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2776 return 2;
2777 return 1;
2778 case SWIZZLE_Z: return 2;
2779 case SWIZZLE_W: return 3;
2780 default:
2781 unreachable("Not reached"); /* zero, one swizzles handled already */
2782 }
2783 }
2784
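/**
 * Apply the GL texture swizzle (key->tex.swizzles[sampler]) to the raw
 * sampler result: ZERO/ONE components become immediate 0.0f/1.0f writes and
 * the remaining components become one swizzled MOV.  txs, tg4, float-typed
 * results and no-op swizzles are copied through unchanged.
 */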
2785 void
2786 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2787 {
2788 int s = key->tex.swizzles[sampler];
2789
2790 this->result = src_reg(this, ir->type);
2791 dst_reg swizzled_result(this->result);
2792
2793 if (ir->op == ir_query_levels) {
2794 /* # levels is in .w */
2795 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2796 emit(MOV(swizzled_result, orig_val));
2797 return;
2798 }
2799
2800 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2801 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2802 emit(MOV(swizzled_result, orig_val));
2803 return;
2804 }
2805
2806
2807 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2808 int swizzle[4] = {0};
2809
2810 for (int i = 0; i < 4; i++) {
2811 switch (GET_SWZ(s, i)) {
2812 case SWIZZLE_ZERO:
2813 zero_mask |= (1 << i);
2814 break;
2815 case SWIZZLE_ONE:
2816 one_mask |= (1 << i);
2817 break;
2818 default:
2819 copy_mask |= (1 << i);
2820 swizzle[i] = GET_SWZ(s, i);
2821 break;
2822 }
2823 }
2824
2825 if (copy_mask) {
2826 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2827 swizzled_result.writemask = copy_mask;
2828 emit(MOV(swizzled_result, orig_val));
2829 }
2830
2831 if (zero_mask) {
2832 swizzled_result.writemask = zero_mask;
2833 emit(MOV(swizzled_result, src_reg(0.0f)));
2834 }
2835
2836 if (one_mask) {
2837 swizzled_result.writemask = one_mask;
2838 emit(MOV(swizzled_result, src_reg(1.0f)));
2839 }
2840 }
2841
2842 void
2843 vec4_visitor::visit(ir_return *)
2844 {
2845 unreachable("not reached");
2846 }
2847
2848 void
2849 vec4_visitor::visit(ir_discard *)
2850 {
2851 unreachable("not reached");
2852 }
2853
2854 void
2855 vec4_visitor::visit(ir_if *ir)
2856 {
2857 /* Don't point the annotation at the if statement, because then it plus
2858 * the then and else blocks get printed.
2859 */
2860 this->base_ir = ir->condition;
2861
2862 if (brw->gen == 6) {
2863 emit_if_gen6(ir);
2864 } else {
2865 enum brw_predicate predicate;
2866 emit_bool_to_cond_code(ir->condition, &predicate);
2867 emit(IF(predicate));
2868 }
2869
2870 visit_instructions(&ir->then_instructions);
2871
2872 if (!ir->else_instructions.is_empty()) {
2873 this->base_ir = ir->condition;
2874 emit(BRW_OPCODE_ELSE);
2875
2876 visit_instructions(&ir->else_instructions);
2877 }
2878
2879 this->base_ir = ir->condition;
2880 emit(BRW_OPCODE_ENDIF);
2881 }
2882
2883 void
2884 vec4_visitor::visit(ir_emit_vertex *)
2885 {
2886 unreachable("not reached");
2887 }
2888
2889 void
2890 vec4_visitor::visit(ir_end_primitive *)
2891 {
2892 unreachable("not reached");
2893 }
2894
2895 void
2896 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2897 dst_reg dst, src_reg offset,
2898 src_reg src0, src_reg src1)
2899 {
2900 unsigned mlen = 0;
2901
2902 /* Set the atomic operation offset. */
2903 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2904 mlen++;
2905
2906 /* Set the atomic operation arguments. */
2907 if (src0.file != BAD_FILE) {
2908 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2909 mlen++;
2910 }
2911
2912 if (src1.file != BAD_FILE) {
2913 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2914 mlen++;
2915 }
2916
2917 /* Emit the instruction. Note that this maps to the normal SIMD8
2918 * untyped atomic message on Ivy Bridge, but that's OK because
2919 * unused channels will be masked out.
2920 */
2921 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2922 src_reg(atomic_op), src_reg(surf_index));
2923 inst->base_mrf = 0;
2924 inst->mlen = mlen;
2925 }
2926
2927 void
2928 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2929 src_reg offset)
2930 {
2931 /* Set the surface read offset. */
2932 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2933
2934 /* Emit the instruction. Note that this maps to the normal SIMD8
2935 * untyped surface read message, but that's OK because unused
2936 * channels will be masked out.
2937 */
2938 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2939 dst, src_reg(surf_index));
2940 inst->base_mrf = 0;
2941 inst->mlen = 1;
2942 }
2943
2944 void
2945 vec4_visitor::emit_ndc_computation()
2946 {
2947 /* Get the position */
2948 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2949
2950 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2951 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2952 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2953
2954 current_annotation = "NDC";
2955 dst_reg ndc_w = ndc;
2956 ndc_w.writemask = WRITEMASK_W;
2957 src_reg pos_w = pos;
2958 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2959 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2960
2961 dst_reg ndc_xyz = ndc;
2962 ndc_xyz.writemask = WRITEMASK_XYZ;
2963
2964 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2965 }
2966
2967 void
2968 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2969 {
2970 if (brw->gen < 6 &&
2971 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2972 key->userclip_active || brw->has_negative_rhw_bug)) {
2973 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2974 dst_reg header1_w = header1;
2975 header1_w.writemask = WRITEMASK_W;
2976
2977 emit(MOV(header1, 0u));
2978
2979 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2980 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2981
2982 current_annotation = "Point size";
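/* psiz * 2^11, truncated to an integer and masked to bits 8..18, stores the
 * point size as an unsigned fixed-point value with 3 fractional bits in the
 * header's point-width field.
 */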
2983 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2984 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2985 }
2986
2987 if (key->userclip_active) {
2988 current_annotation = "Clipping flags";
2989 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2990 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2991
2992 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2993 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2994 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2995
2996 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2997 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2998 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2999 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3000 }
3001
3002 /* i965 clipping workaround:
3003 * 1) Test for -ve rhw
3004 * 2) If set,
3005 * set ndc = (0,0,0,0)
3006 * set ucp[6] = 1
3007 *
3008 * Later, clipping will detect ucp[6] and ensure the primitive is
3009 * clipped against all fixed planes.
3010 */
3011 if (brw->has_negative_rhw_bug) {
3012 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3013 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3014 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3015 vec4_instruction *inst;
3016 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3017 inst->predicate = BRW_PREDICATE_NORMAL;
3018 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3019 inst->predicate = BRW_PREDICATE_NORMAL;
3020 }
3021
3022 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3023 } else if (brw->gen < 6) {
3024 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3025 } else {
3026 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3027 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3028 dst_reg reg_w = reg;
3029 reg_w.writemask = WRITEMASK_W;
3030 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3031 }
3032 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3033 dst_reg reg_y = reg;
3034 reg_y.writemask = WRITEMASK_Y;
3035 reg_y.type = BRW_REGISTER_TYPE_D;
3036 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3037 }
3038 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3039 dst_reg reg_z = reg;
3040 reg_z.writemask = WRITEMASK_Z;
3041 reg_z.type = BRW_REGISTER_TYPE_D;
3042 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3043 }
3044 }
3045 }
3046
3047 void
3048 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3049 {
3050 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3051 *
3052 * "If a linked set of shaders forming the vertex stage contains no
3053 * static write to gl_ClipVertex or gl_ClipDistance, but the
3054 * application has requested clipping against user clip planes through
3055 * the API, then the coordinate written to gl_Position is used for
3056 * comparison against the user clip planes."
3057 *
3058 * This function is only called if the shader didn't write to
3059 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3060 * if the user wrote to it; otherwise we use gl_Position.
3061 */
3062 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3063 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3064 clip_vertex = VARYING_SLOT_POS;
3065 }
3066
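/* Each DP4 below evaluates one user clip plane equation against the clip
 * vertex, writing the result to the corresponding component of this slot
 * (four planes per clip-distance vec4).
 */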
3067 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3068 ++i) {
3069 reg.writemask = 1 << i;
3070 emit(DP4(reg,
3071 src_reg(output_reg[clip_vertex]),
3072 src_reg(this->userplane[i + offset])));
3073 }
3074 }
3075
3076 vec4_instruction *
3077 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3078 {
3079 assert (varying < VARYING_SLOT_MAX);
3080 reg.type = output_reg[varying].type;
3081 current_annotation = output_reg_annotation[varying];
3082 /* Copy the register, saturating if necessary */
3083 return emit(MOV(reg, src_reg(output_reg[varying])));
3084 }
3085
3086 void
3087 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3088 {
3089 reg.type = BRW_REGISTER_TYPE_F;
3090
3091 switch (varying) {
3092 case VARYING_SLOT_PSIZ:
3093 {
3094 /* PSIZ is always in slot 0, and is coupled with other flags. */
3095 current_annotation = "indices, point width, clip flags";
3096 emit_psiz_and_flags(reg);
3097 break;
3098 }
3099 case BRW_VARYING_SLOT_NDC:
3100 current_annotation = "NDC";
3101 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3102 break;
3103 case VARYING_SLOT_POS:
3104 current_annotation = "gl_Position";
3105 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3106 break;
3107 case VARYING_SLOT_EDGE:
3108 /* This is present when doing unfilled polygons. We're supposed to copy
3109 * the edge flag from the user-provided vertex array
3110 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3111 * of that attribute (starts as 1.0f). This is then used in clipping to
3112 * determine which edges should be drawn as wireframe.
3113 */
3114 current_annotation = "edge flag";
3115 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3116 glsl_type::float_type, WRITEMASK_XYZW))));
3117 break;
3118 case BRW_VARYING_SLOT_PAD:
3119 /* No need to write to this slot */
3120 break;
3121 case VARYING_SLOT_COL0:
3122 case VARYING_SLOT_COL1:
3123 case VARYING_SLOT_BFC0:
3124 case VARYING_SLOT_BFC1: {
3125 /* These built-in varyings are only supported in compatibility mode,
3126 * and we only support GS in core profile. So, this must be a vertex
3127 * shader.
3128 */
3129 assert(stage == MESA_SHADER_VERTEX);
3130 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3131 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3132 inst->saturate = true;
3133 break;
3134 }
3135
3136 default:
3137 emit_generic_urb_slot(reg, varying);
3138 break;
3139 }
3140 }
3141
3142 static int
3143 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3144 {
3145 if (brw->gen >= 6) {
3146 /* URB data written (does not include the message header reg) must
3147 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3148 * section 5.4.3.2.2: URB_INTERLEAVED.
3149 *
3150 * URB entries are allocated on a multiple of 1024 bits, so an
3151 * extra 128 bits written here to make the end align to 256 is
3152 * no problem.
3153 */
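/* mlen includes the message header register, so the data portion is
 * mlen - 1; rounding mlen up to an odd value keeps that data portion an
 * even number of registers.
 */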
3154 if ((mlen % 2) != 1)
3155 mlen++;
3156 }
3157
3158 return mlen;
3159 }
3160
3161
3162 /**
3163 * Generates the VUE payload plus the necessary URB write instructions to
3164 * output it.
3165 *
3166 * The VUE layout is documented in Volume 2a.
3167 */
3168 void
3169 vec4_visitor::emit_vertex()
3170 {
3171 /* MRF 0 is reserved for the debugger, so start with message header
3172 * in MRF 1.
3173 */
3174 int base_mrf = 1;
3175 int mrf = base_mrf;
3176 /* In the process of generating our URB write message contents, we
3177 * may need to unspill a register or load from an array. Those
3178 * reads would use MRFs 14-15.
3179 */
3180 int max_usable_mrf = 13;
3181
3182 /* The following assertion verifies that max_usable_mrf causes an
3183 * even-numbered amount of URB write data, which will meet gen6's
3184 * requirements for length alignment.
3185 */
3186 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3187
3188 /* First mrf is the g0-based message header containing URB handles and
3189 * such.
3190 */
3191 emit_urb_write_header(mrf++);
3192
3193 if (brw->gen < 6) {
3194 emit_ndc_computation();
3195 }
3196
3197 /* Lower legacy ff and ClipVertex clipping to clip distances */
3198 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3199 current_annotation = "user clip distances";
3200
3201 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3202 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3203
3204 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3205 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3206 }
3207
3208 /* We may need to split this up into several URB writes, so do them in a
3209 * loop.
3210 */
3211 int slot = 0;
3212 bool complete = false;
3213 do {
3214 /* URB offset is in URB row increments, and each of our MRFs is half of
3215 * one of those, since we're doing interleaved writes.
3216 */
3217 int offset = slot / 2;
3218
3219 mrf = base_mrf + 1;
3220 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3221 emit_urb_slot(dst_reg(MRF, mrf++),
3222 prog_data->vue_map.slot_to_varying[slot]);
3223
3224 /* If this was max_usable_mrf, we can't fit anything more into this
3225 * URB WRITE.
3226 */
3227 if (mrf > max_usable_mrf) {
3228 slot++;
3229 break;
3230 }
3231 }
3232
3233 complete = slot >= prog_data->vue_map.num_slots;
3234 current_annotation = "URB write";
3235 vec4_instruction *inst = emit_urb_write_opcode(complete);
3236 inst->base_mrf = base_mrf;
3237 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3238 inst->offset += offset;
3239 } while (!complete);
3240 }
3241
3242
3243 src_reg
3244 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3245 src_reg *reladdr, int reg_offset)
3246 {
3247 /* Because we store the values to scratch interleaved like our
3248 * vertex data, we need to scale the vec4 index by 2.
3249 */
3250 int message_header_scale = 2;
3251
3252 /* Pre-gen6, the message header uses byte offsets instead of vec4
3253 * (16-byte) offset units.
3254 */
3255 if (brw->gen < 6)
3256 message_header_scale *= 16;
3257
3258 if (reladdr) {
3259 src_reg index = src_reg(this, glsl_type::int_type);
3260
3261 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3262 src_reg(reg_offset)));
3263 emit_before(block, inst, MUL(dst_reg(index), index,
3264 src_reg(message_header_scale)));
3265
3266 return index;
3267 } else {
3268 return src_reg(reg_offset * message_header_scale);
3269 }
3270 }
3271
3272 src_reg
3273 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3274 src_reg *reladdr, int reg_offset)
3275 {
3276 if (reladdr) {
3277 src_reg index = src_reg(this, glsl_type::int_type);
3278
3279 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3280 src_reg(reg_offset)));
3281
3282 /* Pre-gen6, the message header uses byte offsets instead of vec4
3283 * (16-byte) offset units.
3284 */
3285 if (brw->gen < 6) {
3286 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3287 }
3288
3289 return index;
3290 } else if (brw->gen >= 8) {
3291 /* Store the offset in a GRF so we can send-from-GRF. */
3292 src_reg offset = src_reg(this, glsl_type::int_type);
3293 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3294 return offset;
3295 } else {
3296 int message_header_scale = brw->gen < 6 ? 16 : 1;
3297 return src_reg(reg_offset * message_header_scale);
3298 }
3299 }
3300
3301 /**
3302 * Emits an instruction before @inst to load the value named by @orig_src
3303 * from scratch space at @base_offset to @temp.
3304 *
3305 * @base_offset is measured in 32-byte units (the size of a register).
3306 */
3307 void
3308 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3309 dst_reg temp, src_reg orig_src,
3310 int base_offset)
3311 {
3312 int reg_offset = base_offset + orig_src.reg_offset;
3313 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3314 reg_offset);
3315
3316 emit_before(block, inst, SCRATCH_READ(temp, index));
3317 }
3318
3319 /**
3320 * Emits an instruction after @inst to store the value to be written
3321 * to @orig_dst to scratch space at @base_offset, from @temp.
3322 *
3323 * @base_offset is measured in 32-byte units (the size of a register).
3324 */
3325 void
3326 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3327 int base_offset)
3328 {
3329 int reg_offset = base_offset + inst->dst.reg_offset;
3330 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3331 reg_offset);
3332
3333 /* Create a temporary register to store *inst's result in.
3334 *
3335 * We have to be careful in MOVing from our temporary result register in
3336 * the scratch write. If we swizzle from channels of the temporary that
3337 * weren't initialized, it will confuse live interval analysis, which will
3338 * make spilling fail to make progress.
3339 */
3340 src_reg temp = src_reg(this, glsl_type::vec4_type);
3341 temp.type = inst->dst.type;
3342 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3343 int swizzles[4];
3344 for (int i = 0; i < 4; i++)
3345 if (inst->dst.writemask & (1 << i))
3346 swizzles[i] = i;
3347 else
3348 swizzles[i] = first_writemask_chan;
3349 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3350 swizzles[2], swizzles[3]);
3351
3352 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3353 inst->dst.writemask));
3354 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3355 write->predicate = inst->predicate;
3356 write->ir = inst->ir;
3357 write->annotation = inst->annotation;
3358 inst->insert_after(block, write);
3359
3360 inst->dst.file = temp.file;
3361 inst->dst.reg = temp.reg;
3362 inst->dst.reg_offset = temp.reg_offset;
3363 inst->dst.reladdr = NULL;
3364 }
3365
3366 /**
3367 * We can't generally support array access in GRF space, because a
3368 * single instruction's destination can only span 2 contiguous
3369 * registers. So, we send all GRF arrays that get variable index
3370 * access to scratch space.
3371 */
3372 void
3373 vec4_visitor::move_grf_array_access_to_scratch()
3374 {
3375 int scratch_loc[this->virtual_grf_count];
3376 memset(scratch_loc, -1, sizeof(scratch_loc));
3377
3378 /* First, calculate the set of virtual GRFs that need to be punted
3379 * to scratch due to having any array access on them, and where in
3380 * scratch.
3381 */
3382 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3383 if (inst->dst.file == GRF && inst->dst.reladdr &&
3384 scratch_loc[inst->dst.reg] == -1) {
3385 scratch_loc[inst->dst.reg] = c->last_scratch;
3386 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3387 }
3388
3389 for (int i = 0 ; i < 3; i++) {
3390 src_reg *src = &inst->src[i];
3391
3392 if (src->file == GRF && src->reladdr &&
3393 scratch_loc[src->reg] == -1) {
3394 scratch_loc[src->reg] = c->last_scratch;
3395 c->last_scratch += this->virtual_grf_sizes[src->reg];
3396 }
3397 }
3398 }
3399
3400 /* Now, for anything that will be accessed through scratch, rewrite
3401 * it to load/store. Note that this is a _safe list walk, because
3402 * we may generate a new scratch_write instruction after the one
3403 * we're processing.
3404 */
3405 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3406 /* Set up the annotation tracking for new generated instructions. */
3407 base_ir = inst->ir;
3408 current_annotation = inst->annotation;
3409
3410 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3411 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3412 }
3413
3414 for (int i = 0 ; i < 3; i++) {
3415 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3416 continue;
3417
3418 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3419
3420 emit_scratch_read(block, inst, temp, inst->src[i],
3421 scratch_loc[inst->src[i].reg]);
3422
3423 inst->src[i].file = temp.file;
3424 inst->src[i].reg = temp.reg;
3425 inst->src[i].reg_offset = temp.reg_offset;
3426 inst->src[i].reladdr = NULL;
3427 }
3428 }
3429 }
3430
3431 /**
3432 * Emits an instruction before @inst to load the value named by @orig_src
3433 * from the pull constant buffer (surface) at @base_offset to @temp.
3434 */
3435 void
3436 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3437 dst_reg temp, src_reg orig_src,
3438 int base_offset)
3439 {
3440 int reg_offset = base_offset + orig_src.reg_offset;
3441 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3442 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3443 reg_offset);
3444 vec4_instruction *load;
3445
3446 if (brw->gen >= 7) {
3447 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3448 grf_offset.type = offset.type;
3449 emit_before(block, inst, MOV(grf_offset, offset));
3450
3451 load = new(mem_ctx) vec4_instruction(this,
3452 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3453 temp, index, src_reg(grf_offset));
3454 } else {
3455 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3456 temp, index, offset);
3457 load->base_mrf = 14;
3458 load->mlen = 1;
3459 }
3460 emit_before(block, inst, load);
3461 }
3462
3463 /**
3464 * Implements array access of uniforms by inserting a
3465 * PULL_CONSTANT_LOAD instruction.
3466 *
3467 * Unlike temporary GRF array access (where we don't support it due to
3468 * the difficulty of doing relative addressing on instruction
3469 * destinations), we could potentially do array access of uniforms
3470 * that were loaded in GRF space as push constants. In real-world
3471 * usage we've seen, though, the arrays being used are always larger
3472 * than we could load as push constants, so just always move all
3473 * uniform array access out to a pull constant buffer.
3474 */
3475 void
3476 vec4_visitor::move_uniform_array_access_to_pull_constants()
3477 {
3478 int pull_constant_loc[this->uniforms];
3479 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3480 bool nested_reladdr;
3481
3482 /* Walk through and find array access of uniforms. Put a copy of that
3483 * uniform in the pull constant buffer.
3484 *
3485 * Note that we don't move constant-indexed accesses to arrays. No
3486 * testing has been done of the performance impact of this choice.
3487 */
3488 do {
3489 nested_reladdr = false;
3490
3491 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3492 for (int i = 0 ; i < 3; i++) {
3493 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3494 continue;
3495
3496 int uniform = inst->src[i].reg;
3497
3498 if (inst->src[i].reladdr->reladdr)
3499 nested_reladdr = true; /* will need another pass */
3500
3501 /* If this array isn't already present in the pull constant buffer,
3502 * add it.
3503 */
3504 if (pull_constant_loc[uniform] == -1) {
3505 const gl_constant_value **values =
3506 &stage_prog_data->param[uniform * 4];
3507
3508 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3509
3510 assert(uniform < uniform_array_size);
3511 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3512 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3513 = values[j];
3514 }
3515 }
3516
3517 /* Set up the annotation tracking for new generated instructions. */
3518 base_ir = inst->ir;
3519 current_annotation = inst->annotation;
3520
3521 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3522
3523 emit_pull_constant_load(block, inst, temp, inst->src[i],
3524 pull_constant_loc[uniform]);
3525
3526 inst->src[i].file = temp.file;
3527 inst->src[i].reg = temp.reg;
3528 inst->src[i].reg_offset = temp.reg_offset;
3529 inst->src[i].reladdr = NULL;
3530 }
3531 }
3532 } while (nested_reladdr);
3533
3534 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3535 * no need to track them as larger-than-vec4 objects. This will be
3536 * relied on in cutting out unused uniform vectors from push
3537 * constants.
3538 */
3539 split_uniform_registers();
3540 }
3541
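/**
 * If a UD source register carries a negate modifier, apply the negation
 * through an explicit MOV into a fresh unsigned temporary so that later
 * instructions see a plain, unmodified UD operand.
 */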
3542 void
3543 vec4_visitor::resolve_ud_negate(src_reg *reg)
3544 {
3545 if (reg->type != BRW_REGISTER_TYPE_UD ||
3546 !reg->negate)
3547 return;
3548
3549 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3550 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3551 *reg = temp;
3552 }
3553
3554 /**
3555 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3556 *
3557 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3558 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3559 */
3560 void
3561 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3562 {
3563 assert(brw->gen <= 5);
3564
3565 if (!rvalue->type->is_boolean())
3566 return;
3567
3568 src_reg and_result = src_reg(this, rvalue->type);
3569 src_reg neg_result = src_reg(this, rvalue->type);
3570 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3571 emit(MOV(dst_reg(neg_result), negate(and_result)));
3572 *reg = neg_result;
3573 }
3574
3575 vec4_visitor::vec4_visitor(struct brw_context *brw,
3576 struct brw_vec4_compile *c,
3577 struct gl_program *prog,
3578 const struct brw_vue_prog_key *key,
3579 struct brw_vue_prog_data *prog_data,
3580 struct gl_shader_program *shader_prog,
3581 gl_shader_stage stage,
3582 void *mem_ctx,
3583 bool debug_flag,
3584 bool no_spills,
3585 shader_time_shader_type st_base,
3586 shader_time_shader_type st_written,
3587 shader_time_shader_type st_reset)
3588 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3589 c(c),
3590 key(key),
3591 prog_data(prog_data),
3592 sanity_param_count(0),
3593 fail_msg(NULL),
3594 first_non_payload_grf(0),
3595 need_all_constants_in_pull_buffer(false),
3596 debug_flag(debug_flag),
3597 no_spills(no_spills),
3598 st_base(st_base),
3599 st_written(st_written),
3600 st_reset(st_reset)
3601 {
3602 this->mem_ctx = mem_ctx;
3603 this->failed = false;
3604
3605 this->base_ir = NULL;
3606 this->current_annotation = NULL;
3607 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3608
3609 this->variable_ht = hash_table_ctor(0,
3610 hash_table_pointer_hash,
3611 hash_table_pointer_compare);
3612
3613 this->virtual_grf_start = NULL;
3614 this->virtual_grf_end = NULL;
3615 this->virtual_grf_sizes = NULL;
3616 this->virtual_grf_count = 0;
3617 this->virtual_grf_reg_map = NULL;
3618 this->virtual_grf_reg_count = 0;
3619 this->virtual_grf_array_size = 0;
3620 this->live_intervals = NULL;
3621
3622 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3623
3624 this->uniforms = 0;
3625
3626 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3627 * at least one. See setup_uniforms() in brw_vec4.cpp.
3628 */
3629 this->uniform_array_size = 1;
3630 if (prog_data) {
3631 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3632 }
3633
3634 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3635 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3636 }
3637
3638 vec4_visitor::~vec4_visitor()
3639 {
3640 hash_table_dtor(this->variable_ht);
3641 }
3642
3643
3644 void
3645 vec4_visitor::fail(const char *format, ...)
3646 {
3647 va_list va;
3648 char *msg;
3649
3650 if (failed)
3651 return;
3652
3653 failed = true;
3654
3655 va_start(va, format);
3656 msg = ralloc_vasprintf(mem_ctx, format, va);
3657 va_end(va);
3658 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3659
3660 this->fail_msg = msg;
3661
3662 if (debug_flag) {
3663 fprintf(stderr, "%s", msg);
3664 }
3665 }
3666
3667 } /* namespace brw */