i965: Use ~0 to represent true on all generations.
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 extern "C" {
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, const dst_reg &dst,
35 const src_reg &src0, const src_reg &src1,
36 const src_reg &src2)
37 {
38 this->opcode = opcode;
39 this->dst = dst;
40 this->src[0] = src0;
41 this->src[1] = src1;
42 this->src[2] = src2;
43 this->saturate = false;
44 this->force_writemask_all = false;
45 this->no_dd_clear = false;
46 this->no_dd_check = false;
47 this->writes_accumulator = false;
48 this->conditional_mod = BRW_CONDITIONAL_NONE;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
70 vec4_instruction *new_inst)
71 {
72 new_inst->ir = inst->ir;
73 new_inst->annotation = inst->annotation;
74
75 inst->insert_before(block, new_inst);
76
77 return inst;
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
82 const src_reg &src1, const src_reg &src2)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
85 src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
91 const src_reg &src1)
92 {
93 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
98 {
99 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
104 {
105 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
112 }
113
114 #define ALU1(op) \
115 vec4_instruction * \
116 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
117 { \
118 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
119 src0); \
120 }
121
122 #define ALU2(op) \
123 vec4_instruction * \
124 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
125 const src_reg &src1) \
126 { \
127 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
128 src0, src1); \
129 }
130
131 #define ALU2_ACC(op) \
132 vec4_instruction * \
133 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
134 const src_reg &src1) \
135 { \
136 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
137 BRW_OPCODE_##op, dst, src0, src1); \
138 inst->writes_accumulator = true; \
139 return inst; \
140 }
141
142 #define ALU3(op) \
143 vec4_instruction * \
144 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
145 const src_reg &src1, const src_reg &src2) \
146 { \
147 assert(brw->gen >= 6); \
148 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
149 src0, src1, src2); \
150 }
151
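/* Each ALU1/ALU2/ALU2_ACC/ALU3 invocation below defines a builder method
 * that only constructs the instruction; the caller still has to pass it to
 * emit(). For example, ALU2(ADD) expands to roughly:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 *
 * which is why call sites typically read emit(ADD(dst, a, b)).
 */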
152 ALU1(NOT)
153 ALU1(MOV)
154 ALU1(FRC)
155 ALU1(RNDD)
156 ALU1(RNDE)
157 ALU1(RNDZ)
158 ALU1(F32TO16)
159 ALU1(F16TO32)
160 ALU2(ADD)
161 ALU2(MUL)
162 ALU2_ACC(MACH)
163 ALU2(AND)
164 ALU2(OR)
165 ALU2(XOR)
166 ALU2(DP3)
167 ALU2(DP4)
168 ALU2(DPH)
169 ALU2(SHL)
170 ALU2(SHR)
171 ALU2(ASR)
172 ALU3(LRP)
173 ALU1(BFREV)
174 ALU3(BFE)
175 ALU2(BFI1)
176 ALU3(BFI2)
177 ALU1(FBH)
178 ALU1(FBL)
179 ALU1(CBIT)
180 ALU3(MAD)
181 ALU2_ACC(ADDC)
182 ALU2_ACC(SUBB)
183 ALU2(MAC)
184
185 /** Gen4 predicated IF. */
186 vec4_instruction *
187 vec4_visitor::IF(enum brw_predicate predicate)
188 {
189 vec4_instruction *inst;
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
192 inst->predicate = predicate;
193
194 return inst;
195 }
196
197 /** Gen6 IF with embedded comparison. */
198 vec4_instruction *
199 vec4_visitor::IF(src_reg src0, src_reg src1,
200 enum brw_conditional_mod condition)
201 {
202 assert(brw->gen == 6);
203
204 vec4_instruction *inst;
205
206 resolve_ud_negate(&src0);
207 resolve_ud_negate(&src1);
208
209 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
210 src0, src1);
211 inst->conditional_mod = condition;
212
213 return inst;
214 }
215
216 /**
217 * CMP: Sets the low bit of the destination channels with the result
218 * of the comparison, while the upper bits are undefined, and updates
219 * the flag register with the packed 16 bits of the result.
220 */
221 vec4_instruction *
222 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
223 enum brw_conditional_mod condition)
224 {
225 vec4_instruction *inst;
226
227 /* The original gen4 does type conversion to the destination type
228 * before comparison, producing garbage results for floating
229 * point comparisons.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 vec4_instruction *
247 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
248 {
249 vec4_instruction *inst;
250
251 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
252 dst, index);
253 inst->base_mrf = 14;
254 inst->mlen = 2;
255
256 return inst;
257 }
258
259 vec4_instruction *
260 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
261 const src_reg &index)
262 {
263 vec4_instruction *inst;
264
265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
266 dst, src, index);
267 inst->base_mrf = 13;
268 inst->mlen = 3;
269
270 return inst;
271 }
272
273 void
274 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
275 {
276 static enum opcode dot_opcodes[] = {
277 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
278 };
279
280 emit(dot_opcodes[elements - 2], dst, src0, src1);
281 }
282
283 src_reg
284 vec4_visitor::fix_3src_operand(src_reg src)
285 {
286 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
287 * able to use vertical stride of zero to replicate the vec4 uniform, like
288 *
289 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
290 *
291 * But you can't, since vertical stride is always four in three-source
292 * instructions. Instead, insert a MOV instruction to do the replication so
293 * that the three-source instruction can consume it.
294 */
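/* (As a reminder of the region syntax used above: g3<0;4,1>:f is
 * <VertStride; Width, HorzStride>, so a vertical stride of zero would make
 * both halves of the SIMD4x2 pair read the same four floats -- exactly the
 * replication we want but can't express in a three-source instruction.)
 */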
295
296 /* The MOV is only needed if the source is a uniform or immediate. */
297 if (src.file != UNIFORM && src.file != IMM)
298 return src;
299
300 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
306 return src_reg(expanded);
307 }
308
309 src_reg
310 vec4_visitor::fix_math_operand(src_reg src)
311 {
312 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
313 return src;
314
315 /* The gen6 math instruction ignores the source modifiers --
316 * swizzle, abs, negate, and at least some parts of the register
317 * region description.
318 *
319 * Rather than trying to enumerate all these cases, *always* expand the
320 * operand to a temp GRF for gen6.
321 *
322 * For gen7, keep the operand as-is, except if immediate, which gen7 still
323 * can't use.
324 */
325
326 if (brw->gen == 7 && src.file != IMM)
327 return src;
328
329 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
330 expanded.type = src.type;
331 emit(MOV(expanded, src));
332 return src_reg(expanded);
333 }
334
335 void
336 vec4_visitor::emit_math(enum opcode opcode,
337 const dst_reg &dst,
338 const src_reg &src0, const src_reg &src1)
339 {
340 vec4_instruction *math =
341 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
342
343 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
344 /* MATH on Gen6 must be align1, so we can't do writemasks. */
345 math->dst = dst_reg(this, glsl_type::vec4_type);
346 math->dst.type = dst.type;
347 emit(MOV(dst, src_reg(math->dst)));
348 } else if (brw->gen < 6) {
349 math->base_mrf = 1;
350 math->mlen = src1.file == BAD_FILE ? 1 : 2;
351 }
352 }
353
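/* Emit the equivalent of GLSL packHalf2x16(): convert a vec2 to two half
 * floats and pack them into one 32-bit word, with X in the low 16 bits per
 * the GLSL packing rules. As a concrete example, packHalf2x16(vec2(1.0, 2.0))
 * should yield 0x40003c00, since 1.0 encodes as the half-float 0x3c00 and
 * 2.0 as 0x4000.
 */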
354 void
355 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
356 {
357 if (brw->gen < 7) {
358 unreachable("ir_unop_pack_half_2x16 should be lowered");
359 }
360
361 assert(dst.type == BRW_REGISTER_TYPE_UD);
362 assert(src0.type == BRW_REGISTER_TYPE_F);
363
364 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
365 *
366 * Because this instruction does not have a 16-bit floating-point type,
367 * the destination data type must be Word (W).
368 *
369 * The destination must be DWord-aligned and specify a horizontal stride
370 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
371 * each destination channel and the upper word is not modified.
372 *
373 * The above restriction implies that the f32to16 instruction must use
374 * align1 mode, because only in align1 mode is it possible to specify
375 * horizontal stride. We choose here to defy the hardware docs and emit
376 * align16 instructions.
377 *
378 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
379 * instructions. I was partially successful in that the code passed all
380 * tests. However, the code was dubiously correct and fragile, and the
381 * tests were not harsh enough to probe that frailty. Not trusting the
382 * code, I chose instead to remain in align16 mode in defiance of the hw
383 * docs).
384 *
385 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
386 * simulator, emitting a f32to16 in align16 mode with UD as destination
387 * data type is safe. The behavior differs from that specified in the PRM
388 * in that the upper word of each destination channel is cleared to 0.
389 */
390
391 dst_reg tmp_dst(this, glsl_type::uvec2_type);
392 src_reg tmp_src(tmp_dst);
393
394 #if 0
395 /* Verify the undocumented behavior on which the following instructions
396 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
397 * then the result of the bit-or instruction below will be incorrect.
398 *
399 * You should inspect the disasm output in order to verify that the MOV is
400 * not optimized away.
401 */
402 emit(MOV(tmp_dst, src_reg(0x12345678u)));
403 #endif
404
405 /* Give tmp the form below, where "." means untouched.
406 *
407 * w z y x w z y x
408 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
409 *
410 * That the upper word of each write-channel be 0 is required for the
411 * following bit-shift and bit-or instructions to work. Note that this
412 * relies on the undocumented hardware behavior mentioned above.
413 */
414 tmp_dst.writemask = WRITEMASK_XY;
415 emit(F32TO16(tmp_dst, src0));
416
417 /* Give the write-channels of dst the form:
418 * 0xhhhh0000
419 */
420 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
421 emit(SHL(dst, tmp_src, src_reg(16u)));
422
423 /* Finally, give the write-channels of dst the form of packHalf2x16's
424 * output:
425 * 0xhhhhllll
426 */
427 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
428 emit(OR(dst, src_reg(dst), tmp_src));
429 }
430
431 void
432 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
433 {
434 if (brw->gen < 7) {
435 unreachable("ir_unop_unpack_half_2x16 should be lowered");
436 }
437
438 assert(dst.type == BRW_REGISTER_TYPE_F);
439 assert(src0.type == BRW_REGISTER_TYPE_UD);
440
441 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
442 *
443 * Because this instruction does not have a 16-bit floating-point type,
444 * the source data type must be Word (W). The destination type must be
445 * F (Float).
446 *
447 * To use W as the source data type, we must adjust horizontal strides,
448 * which is only possible in align1 mode. All my [chadv] attempts at
449 * emitting align1 instructions for unpackHalf2x16 failed to pass the
450 * Piglit tests, so I gave up.
451 *
452 * I've verified that, on gen7 hardware and the simulator, it is safe to
453 * emit f16to32 in align16 mode with UD as source data type.
454 */
455
456 dst_reg tmp_dst(this, glsl_type::uvec2_type);
457 src_reg tmp_src(tmp_dst);
458
459 tmp_dst.writemask = WRITEMASK_X;
460 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
461
462 tmp_dst.writemask = WRITEMASK_Y;
463 emit(SHR(tmp_dst, src0, src_reg(16u)));
464
465 dst.writemask = WRITEMASK_XY;
466 emit(F16TO32(dst, tmp_src));
467 }
468
469 void
470 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
471 {
472 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
473 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
474 * is not suitable to generate the shift values, but we can use the packed
475 * vector float and a type-converting MOV.
476 */
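/* (The immediate below is a packed vector-float: the VF bytes 0x00, 0x60,
 * 0x70 and 0x78 decode to 0.0, 8.0, 16.0 and 24.0, so the type-converting
 * MOV into a UD register yields the integer shift counts <0, 8, 16, 24>.)
 */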
477 dst_reg shift(this, glsl_type::uvec4_type);
478 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
479
480 dst_reg shifted(this, glsl_type::uvec4_type);
481 src0.swizzle = BRW_SWIZZLE_XXXX;
482 emit(SHR(shifted, src0, src_reg(shift)));
483
484 shifted.type = BRW_REGISTER_TYPE_UB;
485 dst_reg f(this, glsl_type::vec4_type);
486 emit(MOV(f, src_reg(shifted)));
487
488 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
489 }
490
491 void
492 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
493 {
494 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
495 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
496 * is not suitable to generate the shift values, but we can use the packed
497 * vector float and a type-converting MOV.
498 */
499 dst_reg shift(this, glsl_type::uvec4_type);
500 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
501
502 dst_reg shifted(this, glsl_type::uvec4_type);
503 src0.swizzle = BRW_SWIZZLE_XXXX;
504 emit(SHR(shifted, src0, src_reg(shift)));
505
506 shifted.type = BRW_REGISTER_TYPE_B;
507 dst_reg f(this, glsl_type::vec4_type);
508 emit(MOV(f, src_reg(shifted)));
509
510 dst_reg scaled(this, glsl_type::vec4_type);
511 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
512
513 dst_reg max(this, glsl_type::vec4_type);
514 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
515 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
516 }
517
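/* Emit the equivalent of GLSL packUnorm4x8(): per component compute
 * round(clamp(c, 0, 1) * 255.0) and pack the four bytes (the GLSL rules put
 * X in the least significant byte). The saturating MOV does the clamp, RNDE
 * the round-to-even, and VEC4_OPCODE_PACK_BYTES the final byte packing.
 */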
518 void
519 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
520 {
521 dst_reg saturated(this, glsl_type::vec4_type);
522 vec4_instruction *inst = emit(MOV(saturated, src0));
523 inst->saturate = true;
524
525 dst_reg scaled(this, glsl_type::vec4_type);
526 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
527
528 dst_reg rounded(this, glsl_type::vec4_type);
529 emit(RNDE(rounded, src_reg(scaled)));
530
531 dst_reg u(this, glsl_type::uvec4_type);
532 emit(MOV(u, src_reg(rounded)));
533
534 src_reg bytes(u);
535 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
536 }
537
538 void
539 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
540 {
541 dst_reg max(this, glsl_type::vec4_type);
542 emit_minmax(BRW_CONDITIONAL_G, max, src0, src_reg(-1.0f));
543
544 dst_reg min(this, glsl_type::vec4_type);
545 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
546
547 dst_reg scaled(this, glsl_type::vec4_type);
548 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
549
550 dst_reg rounded(this, glsl_type::vec4_type);
551 emit(RNDE(rounded, src_reg(scaled)));
552
553 dst_reg i(this, glsl_type::ivec4_type);
554 emit(MOV(i, src_reg(rounded)));
555
556 src_reg bytes(i);
557 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
558 }
559
560 void
561 vec4_visitor::visit_instructions(const exec_list *list)
562 {
563 foreach_in_list(ir_instruction, ir, list) {
564 base_ir = ir;
565 ir->accept(this);
566 }
567 }
568
569
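/* Size of a GLSL type in vec4 registers. For example: float, int and vec4
 * each take one register; mat4 takes four (one per column); float[10] takes
 * ten; and struct { vec3 v; float f; } takes two, since every member is
 * padded out to a full vec4.
 */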
570 static int
571 type_size(const struct glsl_type *type)
572 {
573 unsigned int i;
574 int size;
575
576 switch (type->base_type) {
577 case GLSL_TYPE_UINT:
578 case GLSL_TYPE_INT:
579 case GLSL_TYPE_FLOAT:
580 case GLSL_TYPE_BOOL:
581 if (type->is_matrix()) {
582 return type->matrix_columns;
583 } else {
584 /* Regardless of size of vector, it gets a vec4. This is bad
585 * packing for things like floats, but otherwise arrays become a
586 * mess. Hopefully a later pass over the code can pack scalars
587 * down if appropriate.
588 */
589 return 1;
590 }
591 case GLSL_TYPE_ARRAY:
592 assert(type->length > 0);
593 return type_size(type->fields.array) * type->length;
594 case GLSL_TYPE_STRUCT:
595 size = 0;
596 for (i = 0; i < type->length; i++) {
597 size += type_size(type->fields.structure[i].type);
598 }
599 return size;
600 case GLSL_TYPE_SAMPLER:
601 /* Samplers take up no register space, since they're baked in at
602 * link time.
603 */
604 return 0;
605 case GLSL_TYPE_ATOMIC_UINT:
606 return 0;
607 case GLSL_TYPE_IMAGE:
608 case GLSL_TYPE_VOID:
609 case GLSL_TYPE_ERROR:
610 case GLSL_TYPE_INTERFACE:
611 unreachable("not reached");
612 }
613
614 return 0;
615 }
616
617 int
618 vec4_visitor::virtual_grf_alloc(int size)
619 {
620 if (virtual_grf_array_size <= virtual_grf_count) {
621 if (virtual_grf_array_size == 0)
622 virtual_grf_array_size = 16;
623 else
624 virtual_grf_array_size *= 2;
625 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
626 virtual_grf_array_size);
627 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
628 virtual_grf_array_size);
629 }
630 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
631 virtual_grf_reg_count += size;
632 virtual_grf_sizes[virtual_grf_count] = size;
633 return virtual_grf_count++;
634 }
635
636 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
637 {
638 init();
639
640 this->file = GRF;
641 this->reg = v->virtual_grf_alloc(type_size(type));
642
643 if (type->is_array() || type->is_record()) {
644 this->swizzle = BRW_SWIZZLE_NOOP;
645 } else {
646 this->swizzle = swizzle_for_size(type->vector_elements);
647 }
648
649 this->type = brw_type_for_base_type(type);
650 }
651
652 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
653 {
654 assert(size > 0);
655
656 init();
657
658 this->file = GRF;
659 this->reg = v->virtual_grf_alloc(type_size(type) * size);
660
661 this->swizzle = BRW_SWIZZLE_NOOP;
662
663 this->type = brw_type_for_base_type(type);
664 }
665
666 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
667 {
668 init();
669
670 this->file = GRF;
671 this->reg = v->virtual_grf_alloc(type_size(type));
672
673 if (type->is_array() || type->is_record()) {
674 this->writemask = WRITEMASK_XYZW;
675 } else {
676 this->writemask = (1 << type->vector_elements) - 1;
677 }
678
679 this->type = brw_type_for_base_type(type);
680 }
681
682 /* Our support for uniforms is piggy-backed on the struct
683 * gl_fragment_program, because that's where the values actually
684 * get stored, rather than in some global gl_shader_program uniform
685 * store.
686 */
687 void
688 vec4_visitor::setup_uniform_values(ir_variable *ir)
689 {
690 int namelen = strlen(ir->name);
691
692 /* The data for our (non-builtin) uniforms is stored in a series of
693 * gl_uniform_driver_storage structs for each subcomponent that
694 * glGetUniformLocation() could name. We know it's been set up in the same
695 * order we'd walk the type, so walk the list of storage and find anything
696 * with our name, or the prefix of a component that starts with our name.
697 */
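/* For example, "uniform vec3 v[2]" matches a single storage entry and
 * yields vector_count == 2 below: two vec4 slots, each with three live
 * components and the fourth pointing at the shared zero constant.
 */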
698 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
699 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
700
701 if (strncmp(ir->name, storage->name, namelen) != 0 ||
702 (storage->name[namelen] != 0 &&
703 storage->name[namelen] != '.' &&
704 storage->name[namelen] != '[')) {
705 continue;
706 }
707
708 gl_constant_value *components = storage->storage;
709 unsigned vector_count = (MAX2(storage->array_elements, 1) *
710 storage->type->matrix_columns);
711
712 for (unsigned s = 0; s < vector_count; s++) {
713 assert(uniforms < uniform_array_size);
714 uniform_vector_size[uniforms] = storage->type->vector_elements;
715
716 int i;
717 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
718 stage_prog_data->param[uniforms * 4 + i] = components;
719 components++;
720 }
721 for (; i < 4; i++) {
722 static gl_constant_value zero = { 0.0 };
723 stage_prog_data->param[uniforms * 4 + i] = &zero;
724 }
725
726 uniforms++;
727 }
728 }
729 }
730
731 void
732 vec4_visitor::setup_uniform_clipplane_values()
733 {
734 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
735
736 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
737 assert(this->uniforms < uniform_array_size);
738 this->uniform_vector_size[this->uniforms] = 4;
739 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
740 this->userplane[i].type = BRW_REGISTER_TYPE_F;
741 for (int j = 0; j < 4; ++j) {
742 stage_prog_data->param[this->uniforms * 4 + j] =
743 (gl_constant_value *) &clip_planes[i][j];
744 }
745 ++this->uniforms;
746 }
747 }
748
749 /* Our support for builtin uniforms is even scarier than non-builtin.
750 * It sits on top of the PROG_STATE_VAR parameters that are
751 * automatically updated from GL context state.
752 */
753 void
754 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
755 {
756 const ir_state_slot *const slots = ir->get_state_slots();
757 assert(slots != NULL);
758
759 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
760 /* This state reference has already been setup by ir_to_mesa,
761 * but we'll get the same index back here. We can reference
762 * ParameterValues directly, since unlike brw_fs.cpp, we never
763 * add new state references during compile.
764 */
765 int index = _mesa_add_state_reference(this->prog->Parameters,
766 (gl_state_index *)slots[i].tokens);
767 gl_constant_value *values =
768 &this->prog->Parameters->ParameterValues[index][0];
769
770 assert(this->uniforms < uniform_array_size);
771 this->uniform_vector_size[this->uniforms] = 0;
772 /* Add each of the unique swizzled channels of the element.
773 * This will end up matching the size of the glsl_type of this field.
774 */
775 int last_swiz = -1;
776 for (unsigned int j = 0; j < 4; j++) {
777 int swiz = GET_SWZ(slots[i].swizzle, j);
778 last_swiz = swiz;
779
780 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
781 assert(this->uniforms < uniform_array_size);
782 if (swiz <= last_swiz)
783 this->uniform_vector_size[this->uniforms]++;
784 }
785 this->uniforms++;
786 }
787 }
788
789 dst_reg *
790 vec4_visitor::variable_storage(ir_variable *var)
791 {
792 return (dst_reg *)hash_table_find(this->variable_ht, var);
793 }
794
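/* Turn a boolean rvalue into a flag-register condition, plus an ALIGN16
 * ANY4H/ALL4H predicate for the vector comparison cases. The gen <= 5
 * special cases below exist because early hardware only defines the low
 * bit of a boolean (see try_emit_b2f_of_compare), so comparisons get
 * resolved first and logic results are ANDed with 1 before setting the
 * conditional mod; on gen6+ the logic op can set the flag directly.
 */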
795 void
796 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
797 enum brw_predicate *predicate)
798 {
799 ir_expression *expr = ir->as_expression();
800
801 *predicate = BRW_PREDICATE_NORMAL;
802
803 if (expr && expr->operation != ir_binop_ubo_load) {
804 src_reg op[3];
805 vec4_instruction *inst;
806
807 assert(expr->get_num_operands() <= 3);
808 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
809 expr->operands[i]->accept(this);
810 op[i] = this->result;
811
812 resolve_ud_negate(&op[i]);
813 }
814
815 switch (expr->operation) {
816 case ir_unop_logic_not:
817 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
818 inst->conditional_mod = BRW_CONDITIONAL_Z;
819 break;
820
821 case ir_binop_logic_xor:
822 if (brw->gen <= 5) {
823 src_reg temp = src_reg(this, ir->type);
824 emit(XOR(dst_reg(temp), op[0], op[1]));
825 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
826 } else {
827 inst = emit(XOR(dst_null_d(), op[0], op[1]));
828 }
829 inst->conditional_mod = BRW_CONDITIONAL_NZ;
830 break;
831
832 case ir_binop_logic_or:
833 if (brw->gen <= 5) {
834 src_reg temp = src_reg(this, ir->type);
835 emit(OR(dst_reg(temp), op[0], op[1]));
836 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
837 } else {
838 inst = emit(OR(dst_null_d(), op[0], op[1]));
839 }
840 inst->conditional_mod = BRW_CONDITIONAL_NZ;
841 break;
842
843 case ir_binop_logic_and:
844 if (brw->gen <= 5) {
845 src_reg temp = src_reg(this, ir->type);
846 emit(AND(dst_reg(temp), op[0], op[1]));
847 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
848 } else {
849 inst = emit(AND(dst_null_d(), op[0], op[1]));
850 }
851 inst->conditional_mod = BRW_CONDITIONAL_NZ;
852 break;
853
854 case ir_unop_f2b:
855 if (brw->gen >= 6) {
856 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
857 } else {
858 inst = emit(MOV(dst_null_f(), op[0]));
859 inst->conditional_mod = BRW_CONDITIONAL_NZ;
860 }
861 break;
862
863 case ir_unop_i2b:
864 if (brw->gen >= 6) {
865 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
866 } else {
867 inst = emit(MOV(dst_null_d(), op[0]));
868 inst->conditional_mod = BRW_CONDITIONAL_NZ;
869 }
870 break;
871
872 case ir_binop_all_equal:
873 if (brw->gen <= 5) {
874 resolve_bool_comparison(expr->operands[0], &op[0]);
875 resolve_bool_comparison(expr->operands[1], &op[1]);
876 }
877 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
878 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
879 break;
880
881 case ir_binop_any_nequal:
882 if (brw->gen <= 5) {
883 resolve_bool_comparison(expr->operands[0], &op[0]);
884 resolve_bool_comparison(expr->operands[1], &op[1]);
885 }
886 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
887 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
888 break;
889
890 case ir_unop_any:
891 if (brw->gen <= 5) {
892 resolve_bool_comparison(expr->operands[0], &op[0]);
893 }
894 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
895 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
896 break;
897
898 case ir_binop_greater:
899 case ir_binop_gequal:
900 case ir_binop_less:
901 case ir_binop_lequal:
902 case ir_binop_equal:
903 case ir_binop_nequal:
904 if (brw->gen <= 5) {
905 resolve_bool_comparison(expr->operands[0], &op[0]);
906 resolve_bool_comparison(expr->operands[1], &op[1]);
907 }
908 emit(CMP(dst_null_d(), op[0], op[1],
909 brw_conditional_for_comparison(expr->operation)));
910 break;
911
912 case ir_triop_csel: {
913 /* Expand the boolean condition into the flag register. */
914 inst = emit(MOV(dst_null_d(), op[0]));
915 inst->conditional_mod = BRW_CONDITIONAL_NZ;
916
917 /* Select which boolean to return. */
918 dst_reg temp(this, expr->operands[1]->type);
919 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
920 inst->predicate = BRW_PREDICATE_NORMAL;
921
922 /* Expand the result to a condition code. */
923 inst = emit(MOV(dst_null_d(), src_reg(temp)));
924 inst->conditional_mod = BRW_CONDITIONAL_NZ;
925 break;
926 }
927
928 default:
929 unreachable("not reached");
930 }
931 return;
932 }
933
934 ir->accept(this);
935
936 resolve_ud_negate(&this->result);
937
938 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
939 inst->conditional_mod = BRW_CONDITIONAL_NZ;
940 }
941
942 /**
943 * Emit a gen6 IF statement with the comparison folded into the IF
944 * instruction.
945 */
946 void
947 vec4_visitor::emit_if_gen6(ir_if *ir)
948 {
949 ir_expression *expr = ir->condition->as_expression();
950
951 if (expr && expr->operation != ir_binop_ubo_load) {
952 src_reg op[3];
953 dst_reg temp;
954
955 assert(expr->get_num_operands() <= 3);
956 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
957 expr->operands[i]->accept(this);
958 op[i] = this->result;
959 }
960
961 switch (expr->operation) {
962 case ir_unop_logic_not:
963 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
964 return;
965
966 case ir_binop_logic_xor:
967 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
968 return;
969
970 case ir_binop_logic_or:
971 temp = dst_reg(this, glsl_type::bool_type);
972 emit(OR(temp, op[0], op[1]));
973 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
974 return;
975
976 case ir_binop_logic_and:
977 temp = dst_reg(this, glsl_type::bool_type);
978 emit(AND(temp, op[0], op[1]));
979 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
980 return;
981
982 case ir_unop_f2b:
983 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
984 return;
985
986 case ir_unop_i2b:
987 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
988 return;
989
990 case ir_binop_greater:
991 case ir_binop_gequal:
992 case ir_binop_less:
993 case ir_binop_lequal:
994 case ir_binop_equal:
995 case ir_binop_nequal:
996 emit(IF(op[0], op[1],
997 brw_conditional_for_comparison(expr->operation)));
998 return;
999
1000 case ir_binop_all_equal:
1001 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1002 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1003 return;
1004
1005 case ir_binop_any_nequal:
1006 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1007 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1008 return;
1009
1010 case ir_unop_any:
1011 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1012 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1013 return;
1014
1015 case ir_triop_csel: {
1016 /* Expand the boolean condition into the flag register. */
1017 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1018 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1019
1020 /* Select which boolean to return. */
1021 dst_reg temp(this, expr->operands[1]->type);
1022 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1023 inst->predicate = BRW_PREDICATE_NORMAL;
1024
1025 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1026 return;
1027 }
1028
1029 default:
1030 unreachable("not reached");
1031 }
1032 return;
1033 }
1034
1035 ir->condition->accept(this);
1036
1037 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1038 }
1039
1040 void
1041 vec4_visitor::visit(ir_variable *ir)
1042 {
1043 dst_reg *reg = NULL;
1044
1045 if (variable_storage(ir))
1046 return;
1047
1048 switch (ir->data.mode) {
1049 case ir_var_shader_in:
1050 assert(ir->data.location != -1);
1051 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1052 break;
1053
1054 case ir_var_shader_out:
1055 assert(ir->data.location != -1);
1056 reg = new(mem_ctx) dst_reg(this, ir->type);
1057
1058 for (int i = 0; i < type_size(ir->type); i++) {
1059 output_reg[ir->data.location + i] = *reg;
1060 output_reg[ir->data.location + i].reg_offset = i;
1061 output_reg[ir->data.location + i].type =
1062 brw_type_for_base_type(ir->type->get_scalar_type());
1063 output_reg_annotation[ir->data.location + i] = ir->name;
1064 }
1065 break;
1066
1067 case ir_var_auto:
1068 case ir_var_temporary:
1069 reg = new(mem_ctx) dst_reg(this, ir->type);
1070 break;
1071
1072 case ir_var_uniform:
1073 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1074
1075 /* Thanks to the lower_ubo_reference pass, we will see only
1076 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1077 * variables, so no need for them to be in variable_ht.
1078 *
1079 * Some uniforms, such as samplers and atomic counters, have no actual
1080 * storage, so we should ignore them.
1081 */
1082 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1083 return;
1084
1085 /* Track how big the whole uniform variable is, in case we need to put a
1086 * copy of its data into pull constants for array access.
1087 */
1088 assert(this->uniforms < uniform_array_size);
1089 this->uniform_size[this->uniforms] = type_size(ir->type);
1090
1091 if (!strncmp(ir->name, "gl_", 3)) {
1092 setup_builtin_uniform_values(ir);
1093 } else {
1094 setup_uniform_values(ir);
1095 }
1096 break;
1097
1098 case ir_var_system_value:
1099 reg = make_reg_for_system_value(ir);
1100 break;
1101
1102 default:
1103 unreachable("not reached");
1104 }
1105
1106 reg->type = brw_type_for_base_type(ir->type);
1107 hash_table_insert(this->variable_ht, reg, ir);
1108 }
1109
1110 void
1111 vec4_visitor::visit(ir_loop *ir)
1112 {
1113 /* We don't want debugging output to print the whole body of the
1114 * loop as the annotation.
1115 */
1116 this->base_ir = NULL;
1117
1118 emit(BRW_OPCODE_DO);
1119
1120 visit_instructions(&ir->body_instructions);
1121
1122 emit(BRW_OPCODE_WHILE);
1123 }
1124
1125 void
1126 vec4_visitor::visit(ir_loop_jump *ir)
1127 {
1128 switch (ir->mode) {
1129 case ir_loop_jump::jump_break:
1130 emit(BRW_OPCODE_BREAK);
1131 break;
1132 case ir_loop_jump::jump_continue:
1133 emit(BRW_OPCODE_CONTINUE);
1134 break;
1135 }
1136 }
1137
1138
1139 void
1140 vec4_visitor::visit(ir_function_signature *)
1141 {
1142 unreachable("not reached");
1143 }
1144
1145 void
1146 vec4_visitor::visit(ir_function *ir)
1147 {
1148 /* Ignore function bodies other than main() -- we shouldn't see calls to
1149 * them since they should all be inlined.
1150 */
1151 if (strcmp(ir->name, "main") == 0) {
1152 const ir_function_signature *sig;
1153 exec_list empty;
1154
1155 sig = ir->matching_signature(NULL, &empty, false);
1156
1157 assert(sig);
1158
1159 visit_instructions(&sig->body);
1160 }
1161 }
1162
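/* Try to fuse an add whose left or right operand is a multiply into a
 * single MAD, emitting the addend as the first source so the result is
 * a * b + c. Returns false -- leaving the caller to emit a plain ADD --
 * when 3-src instructions aren't available (gen < 6), the type isn't
 * float, or neither operand is a multiply.
 */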
1163 bool
1164 vec4_visitor::try_emit_mad(ir_expression *ir)
1165 {
1166 /* 3-src instructions were introduced in gen6. */
1167 if (brw->gen < 6)
1168 return false;
1169
1170 /* MAD can only handle floating-point data. */
1171 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1172 return false;
1173
1174 ir_rvalue *nonmul = ir->operands[1];
1175 ir_expression *mul = ir->operands[0]->as_expression();
1176
1177 if (!mul || mul->operation != ir_binop_mul) {
1178 nonmul = ir->operands[0];
1179 mul = ir->operands[1]->as_expression();
1180
1181 if (!mul || mul->operation != ir_binop_mul)
1182 return false;
1183 }
1184
1185 nonmul->accept(this);
1186 src_reg src0 = fix_3src_operand(this->result);
1187
1188 mul->operands[0]->accept(this);
1189 src_reg src1 = fix_3src_operand(this->result);
1190
1191 mul->operands[1]->accept(this);
1192 src_reg src2 = fix_3src_operand(this->result);
1193
1194 this->result = src_reg(this, ir->type);
1195 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1196
1197 return true;
1198 }
1199
1200 bool
1201 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1202 {
1203 /* This optimization relies on CMP setting the destination to 0 when
1204 * false. Early hardware only sets the least significant bit, and
1205 * leaves the other bits undefined. So we can't use it.
1206 */
1207 if (brw->gen < 6)
1208 return false;
1209
1210 ir_expression *const cmp = ir->operands[0]->as_expression();
1211
1212 if (cmp == NULL)
1213 return false;
1214
1215 switch (cmp->operation) {
1216 case ir_binop_less:
1217 case ir_binop_greater:
1218 case ir_binop_lequal:
1219 case ir_binop_gequal:
1220 case ir_binop_equal:
1221 case ir_binop_nequal:
1222 break;
1223
1224 default:
1225 return false;
1226 }
1227
1228 cmp->operands[0]->accept(this);
1229 const src_reg cmp_src0 = this->result;
1230
1231 cmp->operands[1]->accept(this);
1232 const src_reg cmp_src1 = this->result;
1233
1234 this->result = src_reg(this, ir->type);
1235
1236 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1237 brw_conditional_for_comparison(cmp->operation)));
1238
1239 /* If the comparison is false, this->result will just happen to be zero.
1240 */
1241 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1242 this->result, src_reg(1.0f));
1243 inst->predicate = BRW_PREDICATE_NORMAL;
1244 inst->predicate_inverse = true;
1245
1246 return true;
1247 }
1248
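/* min()/max() helper: on gen6+ a single SEL with a conditional mod picks
 * src0 whenever src0 <cond> src1 holds (BRW_CONDITIONAL_L gives min,
 * BRW_CONDITIONAL_G gives max); before gen6 the same effect takes a CMP to
 * set the flag followed by a predicated SEL.
 */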
1249 void
1250 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1251 src_reg src0, src_reg src1)
1252 {
1253 vec4_instruction *inst;
1254
1255 if (brw->gen >= 6) {
1256 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1257 inst->conditional_mod = conditionalmod;
1258 } else {
1259 emit(CMP(dst, src0, src1, conditionalmod));
1260
1261 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1262 inst->predicate = BRW_PREDICATE_NORMAL;
1263 }
1264 }
1265
1266 void
1267 vec4_visitor::emit_lrp(const dst_reg &dst,
1268 const src_reg &x, const src_reg &y, const src_reg &a)
1269 {
1270 if (brw->gen >= 6) {
1271 /* Note that the instruction's argument order is reversed from GLSL
1272 * and the IR.
1273 */
1274 emit(LRP(dst,
1275 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1276 } else {
1277 /* Earlier generations don't support three source operations, so we
1278 * need to emit x*(1-a) + y*a.
1279 */
1280 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1281 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1282 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1283 y_times_a.writemask = dst.writemask;
1284 one_minus_a.writemask = dst.writemask;
1285 x_times_one_minus_a.writemask = dst.writemask;
1286
1287 emit(MUL(y_times_a, y, a));
1288 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1289 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1290 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1291 }
1292 }
1293
1294 void
1295 vec4_visitor::visit(ir_expression *ir)
1296 {
1297 unsigned int operand;
1298 src_reg op[Elements(ir->operands)];
1299 vec4_instruction *inst;
1300
1301 if (ir->operation == ir_binop_add) {
1302 if (try_emit_mad(ir))
1303 return;
1304 }
1305
1306 if (ir->operation == ir_unop_b2f) {
1307 if (try_emit_b2f_of_compare(ir))
1308 return;
1309 }
1310
1311 /* Storage for our result. Ideally for an assignment we'd be using
1312 * the actual storage for the result here, instead.
1313 */
1314 dst_reg result_dst(this, ir->type);
1315 src_reg result_src(result_dst);
1316
1317 if (ir->operation == ir_triop_csel) {
1318 ir->operands[1]->accept(this);
1319 op[1] = this->result;
1320 ir->operands[2]->accept(this);
1321 op[2] = this->result;
1322
1323 enum brw_predicate predicate;
1324 emit_bool_to_cond_code(ir->operands[0], &predicate);
1325 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1326 inst->predicate = predicate;
1327 this->result = result_src;
1328 return;
1329 }
1330
1331 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1332 this->result.file = BAD_FILE;
1333 ir->operands[operand]->accept(this);
1334 if (this->result.file == BAD_FILE) {
1335 fprintf(stderr, "Failed to get tree for expression operand:\n");
1336 ir->operands[operand]->fprint(stderr);
1337 exit(1);
1338 }
1339 op[operand] = this->result;
1340
1341 /* Matrix expression operands should have been broken down to vector
1342 * operations already.
1343 */
1344 assert(!ir->operands[operand]->type->is_matrix());
1345 }
1346
1347 /* If nothing special happens, this is the result. */
1348 this->result = result_src;
1349
1350 switch (ir->operation) {
1351 case ir_unop_logic_not:
1352 emit(NOT(result_dst, op[0]));
1353 break;
1354 case ir_unop_neg:
1355 op[0].negate = !op[0].negate;
1356 emit(MOV(result_dst, op[0]));
1357 break;
1358 case ir_unop_abs:
1359 op[0].abs = true;
1360 op[0].negate = false;
1361 emit(MOV(result_dst, op[0]));
1362 break;
1363
1364 case ir_unop_sign:
1365 if (ir->type->is_float()) {
1366 /* AND(val, 0x80000000) gives the sign bit.
1367 *
1368 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1369 * zero.
1370 */
1371 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1372
1373 op[0].type = BRW_REGISTER_TYPE_UD;
1374 result_dst.type = BRW_REGISTER_TYPE_UD;
1375 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1376
1377 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1378 inst->predicate = BRW_PREDICATE_NORMAL;
1379
1380 this->result.type = BRW_REGISTER_TYPE_F;
1381 } else {
1382 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1383 * -> non-negative val generates 0x00000000.
1384 * Predicated OR sets 1 if val is positive.
1385 */
1386 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1387
1388 emit(ASR(result_dst, op[0], src_reg(31)));
1389
1390 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1391 inst->predicate = BRW_PREDICATE_NORMAL;
1392 }
1393 break;
1394
1395 case ir_unop_rcp:
1396 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1397 break;
1398
1399 case ir_unop_exp2:
1400 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1401 break;
1402 case ir_unop_log2:
1403 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1404 break;
1405 case ir_unop_exp:
1406 case ir_unop_log:
1407 unreachable("not reached: should be handled by ir_explog_to_explog2");
1408 case ir_unop_sin:
1409 case ir_unop_sin_reduced:
1410 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1411 break;
1412 case ir_unop_cos:
1413 case ir_unop_cos_reduced:
1414 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1415 break;
1416
1417 case ir_unop_dFdx:
1418 case ir_unop_dFdx_coarse:
1419 case ir_unop_dFdx_fine:
1420 case ir_unop_dFdy:
1421 case ir_unop_dFdy_coarse:
1422 case ir_unop_dFdy_fine:
1423 unreachable("derivatives not valid in vertex shader");
1424
1425 case ir_unop_bitfield_reverse:
1426 emit(BFREV(result_dst, op[0]));
1427 break;
1428 case ir_unop_bit_count:
1429 emit(CBIT(result_dst, op[0]));
1430 break;
1431 case ir_unop_find_msb: {
1432 src_reg temp = src_reg(this, glsl_type::uint_type);
1433
1434 inst = emit(FBH(dst_reg(temp), op[0]));
1435 inst->dst.writemask = WRITEMASK_XYZW;
1436
1437 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1438 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1439 * subtract the result from 31 to convert the MSB count into an LSB count.
1440 */
1441
1442 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1443 temp.swizzle = BRW_SWIZZLE_NOOP;
1444 emit(MOV(result_dst, temp));
1445
1446 src_reg src_tmp = src_reg(result_dst);
1447 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1448
1449 src_tmp.negate = true;
1450 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1451 inst->predicate = BRW_PREDICATE_NORMAL;
1452 break;
1453 }
1454 case ir_unop_find_lsb:
1455 emit(FBL(result_dst, op[0]));
1456 break;
1457 case ir_unop_saturate:
1458 inst = emit(MOV(result_dst, op[0]));
1459 inst->saturate = true;
1460 break;
1461
1462 case ir_unop_noise:
1463 unreachable("not reached: should be handled by lower_noise");
1464
1465 case ir_binop_add:
1466 emit(ADD(result_dst, op[0], op[1]));
1467 break;
1468 case ir_binop_sub:
1469 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1470
1471 case ir_binop_mul:
1472 if (brw->gen < 8 && ir->type->is_integer()) {
1473 /* For integer multiplication, the MUL uses the low 16 bits of one of
1474 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1475 * accumulates the contribution of the upper 16 bits of that
1476 * operand. If we can determine that one of the args is in the low
1477 * 16 bits, though, we can just emit a single MUL.
1478 */
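/* (That is why the general case below takes three instructions: MUL
 * computes a partial product and primes the accumulator, MACH folds in the
 * remaining contribution so the accumulator ends up holding the low 32
 * bits of the product -- MACH's own destination would receive the high 32
 * bits, as ir_binop_imul_high uses below -- and the final MOV copies the
 * accumulator into the destination.)
 */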
1479 if (ir->operands[0]->is_uint16_constant()) {
1480 if (brw->gen < 7)
1481 emit(MUL(result_dst, op[0], op[1]));
1482 else
1483 emit(MUL(result_dst, op[1], op[0]));
1484 } else if (ir->operands[1]->is_uint16_constant()) {
1485 if (brw->gen < 7)
1486 emit(MUL(result_dst, op[1], op[0]));
1487 else
1488 emit(MUL(result_dst, op[0], op[1]));
1489 } else {
1490 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1491
1492 emit(MUL(acc, op[0], op[1]));
1493 emit(MACH(dst_null_d(), op[0], op[1]));
1494 emit(MOV(result_dst, src_reg(acc)));
1495 }
1496 } else {
1497 emit(MUL(result_dst, op[0], op[1]));
1498 }
1499 break;
1500 case ir_binop_imul_high: {
1501 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1502
1503 emit(MUL(acc, op[0], op[1]));
1504 emit(MACH(result_dst, op[0], op[1]));
1505 break;
1506 }
1507 case ir_binop_div:
1508 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1509 assert(ir->type->is_integer());
1510 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1511 break;
1512 case ir_binop_carry: {
1513 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1514
1515 emit(ADDC(dst_null_ud(), op[0], op[1]));
1516 emit(MOV(result_dst, src_reg(acc)));
1517 break;
1518 }
1519 case ir_binop_borrow: {
1520 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1521
1522 emit(SUBB(dst_null_ud(), op[0], op[1]));
1523 emit(MOV(result_dst, src_reg(acc)));
1524 break;
1525 }
1526 case ir_binop_mod:
1527 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1528 assert(ir->type->is_integer());
1529 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1530 break;
1531
1532 case ir_binop_less:
1533 case ir_binop_greater:
1534 case ir_binop_lequal:
1535 case ir_binop_gequal:
1536 case ir_binop_equal:
1537 case ir_binop_nequal: {
1538 if (brw->gen <= 5) {
1539 resolve_bool_comparison(ir->operands[0], &op[0]);
1540 resolve_bool_comparison(ir->operands[1], &op[1]);
1541 }
1542 emit(CMP(result_dst, op[0], op[1],
1543 brw_conditional_for_comparison(ir->operation)));
1544 break;
1545 }
1546
1547 case ir_binop_all_equal:
1548 /* "==" operator producing a scalar boolean. */
1549 if (ir->operands[0]->type->is_vector() ||
1550 ir->operands[1]->type->is_vector()) {
1551 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1552 emit(MOV(result_dst, src_reg(0)));
1553 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1554 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1555 } else {
1556 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1557 }
1558 break;
1559 case ir_binop_any_nequal:
1560 /* "!=" operator producing a scalar boolean. */
1561 if (ir->operands[0]->type->is_vector() ||
1562 ir->operands[1]->type->is_vector()) {
1563 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1564
1565 emit(MOV(result_dst, src_reg(0)));
1566 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1567 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1568 } else {
1569 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1570 }
1571 break;
1572
1573 case ir_unop_any:
1574 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1575 emit(MOV(result_dst, src_reg(0)));
1576
1577 inst = emit(MOV(result_dst, src_reg((int)ctx->Const.UniformBooleanTrue)));
1578 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1579 break;
1580
1581 case ir_binop_logic_xor:
1582 emit(XOR(result_dst, op[0], op[1]));
1583 break;
1584
1585 case ir_binop_logic_or:
1586 emit(OR(result_dst, op[0], op[1]));
1587 break;
1588
1589 case ir_binop_logic_and:
1590 emit(AND(result_dst, op[0], op[1]));
1591 break;
1592
1593 case ir_binop_dot:
1594 assert(ir->operands[0]->type->is_vector());
1595 assert(ir->operands[0]->type == ir->operands[1]->type);
1596 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1597 break;
1598
1599 case ir_unop_sqrt:
1600 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1601 break;
1602 case ir_unop_rsq:
1603 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1604 break;
1605
1606 case ir_unop_bitcast_i2f:
1607 case ir_unop_bitcast_u2f:
1608 this->result = op[0];
1609 this->result.type = BRW_REGISTER_TYPE_F;
1610 break;
1611
1612 case ir_unop_bitcast_f2i:
1613 this->result = op[0];
1614 this->result.type = BRW_REGISTER_TYPE_D;
1615 break;
1616
1617 case ir_unop_bitcast_f2u:
1618 this->result = op[0];
1619 this->result.type = BRW_REGISTER_TYPE_UD;
1620 break;
1621
1622 case ir_unop_i2f:
1623 case ir_unop_i2u:
1624 case ir_unop_u2i:
1625 case ir_unop_u2f:
1626 case ir_unop_f2i:
1627 case ir_unop_f2u:
1628 emit(MOV(result_dst, op[0]));
1629 break;
1630 case ir_unop_b2i:
1631 emit(AND(result_dst, op[0], src_reg(1)));
1632 break;
1633 case ir_unop_b2f:
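/* A boolean here is 0 or ~0 (after resolve_bool_comparison on gen <= 5),
 * so ANDing with 0x3f800000 -- the bit pattern of 1.0f -- produces 0.0f or
 * 1.0f without needing a compare.
 */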
1634 if (brw->gen <= 5) {
1635 resolve_bool_comparison(ir->operands[0], &op[0]);
1636 }
1637 op[0].type = BRW_REGISTER_TYPE_D;
1638 result_dst.type = BRW_REGISTER_TYPE_D;
1639 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1640 result_dst.type = BRW_REGISTER_TYPE_F;
1641 break;
1642 case ir_unop_f2b:
1643 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1644 break;
1645 case ir_unop_i2b:
1646 emit(AND(result_dst, op[0], src_reg(1)));
1647 break;
1648
1649 case ir_unop_trunc:
1650 emit(RNDZ(result_dst, op[0]));
1651 break;
1652 case ir_unop_ceil:
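/* ceil(x) == -floor(-x): negate the source, round down, then mark the
 * result as negated.
 */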
1653 op[0].negate = !op[0].negate;
1654 inst = emit(RNDD(result_dst, op[0]));
1655 this->result.negate = true;
1656 break;
1657 case ir_unop_floor:
1658 inst = emit(RNDD(result_dst, op[0]));
1659 break;
1660 case ir_unop_fract:
1661 inst = emit(FRC(result_dst, op[0]));
1662 break;
1663 case ir_unop_round_even:
1664 emit(RNDE(result_dst, op[0]));
1665 break;
1666
1667 case ir_binop_min:
1668 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1669 break;
1670 case ir_binop_max:
1671 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1672 break;
1673
1674 case ir_binop_pow:
1675 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1676 break;
1677
1678 case ir_unop_bit_not:
1679 inst = emit(NOT(result_dst, op[0]));
1680 break;
1681 case ir_binop_bit_and:
1682 inst = emit(AND(result_dst, op[0], op[1]));
1683 break;
1684 case ir_binop_bit_xor:
1685 inst = emit(XOR(result_dst, op[0], op[1]));
1686 break;
1687 case ir_binop_bit_or:
1688 inst = emit(OR(result_dst, op[0], op[1]));
1689 break;
1690
1691 case ir_binop_lshift:
1692 inst = emit(SHL(result_dst, op[0], op[1]));
1693 break;
1694
1695 case ir_binop_rshift:
1696 if (ir->type->base_type == GLSL_TYPE_INT)
1697 inst = emit(ASR(result_dst, op[0], op[1]));
1698 else
1699 inst = emit(SHR(result_dst, op[0], op[1]));
1700 break;
1701
1702 case ir_binop_bfm:
1703 emit(BFI1(result_dst, op[0], op[1]));
1704 break;
1705
1706 case ir_binop_ubo_load: {
1707 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1708 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1709 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1710 src_reg offset;
1711
1712 /* Now, load the vector from that offset. */
1713 assert(ir->type->is_vector() || ir->type->is_scalar());
1714
1715 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1716 packed_consts.type = result.type;
1717 src_reg surf_index;
1718
1719 if (const_uniform_block) {
1720 /* The block index is a constant, so just emit the binding table entry
1721 * as an immediate.
1722 */
1723 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1724 const_uniform_block->value.u[0]);
1725 } else {
1726 /* The block index is not a constant. Evaluate the index expression
1727 * per-channel and add the base UBO index; the generator will select
1728 * a value from any live channel.
1729 */
1730 surf_index = src_reg(this, glsl_type::uint_type);
1731 emit(ADD(dst_reg(surf_index), op[0],
1732 src_reg(prog_data->base.binding_table.ubo_start)));
1733
1734 /* Assume this may touch any UBO. It would be nice to provide
1735 * a tighter bound, but the array information is already lowered away.
1736 */
1737 brw_mark_surface_used(&prog_data->base,
1738 prog_data->base.binding_table.ubo_start +
1739 shader_prog->NumUniformBlocks - 1);
1740 }
1741
1742 if (const_offset_ir) {
1743 if (brw->gen >= 8) {
1744 /* Store the offset in a GRF so we can send-from-GRF. */
1745 offset = src_reg(this, glsl_type::int_type);
1746 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1747 } else {
1748 /* Immediates are fine on older generations since they'll be moved
1749 * to a (potentially fake) MRF at the generator level.
1750 */
1751 offset = src_reg(const_offset / 16);
1752 }
1753 } else {
1754 offset = src_reg(this, glsl_type::uint_type);
1755 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1756 }
1757
1758 if (brw->gen >= 7) {
1759 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1760 grf_offset.type = offset.type;
1761
1762 emit(MOV(grf_offset, offset));
1763
1764 emit(new(mem_ctx) vec4_instruction(this,
1765 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1766 dst_reg(packed_consts),
1767 surf_index,
1768 src_reg(grf_offset)));
1769 } else {
1770 vec4_instruction *pull =
1771 emit(new(mem_ctx) vec4_instruction(this,
1772 VS_OPCODE_PULL_CONSTANT_LOAD,
1773 dst_reg(packed_consts),
1774 surf_index,
1775 offset));
1776 pull->base_mrf = 14;
1777 pull->mlen = 1;
1778 }
1779
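/* A pull-constant load fetches an aligned 16-byte vec4, so when the
 * constant offset isn't 16-byte aligned the swizzle below picks out the
 * right dwords. For example, a float at const_offset == 20 reads the vec4
 * at byte 16 and then broadcasts component 1 (20 % 16 / 4) to every
 * channel.
 */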
1780 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1781 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1782 const_offset % 16 / 4,
1783 const_offset % 16 / 4,
1784 const_offset % 16 / 4);
1785
1786 /* UBO bools are any nonzero int. We need to convert them to use the
1787 * value of true stored in ctx->Const.UniformBooleanTrue.
1788 */
1789 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1790 emit(CMP(result_dst, packed_consts, src_reg(0u),
1791 BRW_CONDITIONAL_NZ));
1792 } else {
1793 emit(MOV(result_dst, packed_consts));
1794 }
1795 break;
1796 }
1797
1798 case ir_binop_vector_extract:
1799 unreachable("should have been lowered by vec_index_to_cond_assign");
1800
1801 case ir_triop_fma:
1802 op[0] = fix_3src_operand(op[0]);
1803 op[1] = fix_3src_operand(op[1]);
1804 op[2] = fix_3src_operand(op[2]);
1805 /* Note that the instruction's argument order is reversed from GLSL
1806 * and the IR.
1807 */
1808 emit(MAD(result_dst, op[2], op[1], op[0]));
1809 break;
1810
1811 case ir_triop_lrp:
1812 emit_lrp(result_dst, op[0], op[1], op[2]);
1813 break;
1814
1815 case ir_triop_csel:
1816 unreachable("already handled above");
1817 break;
1818
1819 case ir_triop_bfi:
1820 op[0] = fix_3src_operand(op[0]);
1821 op[1] = fix_3src_operand(op[1]);
1822 op[2] = fix_3src_operand(op[2]);
1823 emit(BFI2(result_dst, op[0], op[1], op[2]));
1824 break;
1825
1826 case ir_triop_bitfield_extract:
1827 op[0] = fix_3src_operand(op[0]);
1828 op[1] = fix_3src_operand(op[1]);
1829 op[2] = fix_3src_operand(op[2]);
1830 /* Note that the instruction's argument order is reversed from GLSL
1831 * and the IR.
1832 */
1833 emit(BFE(result_dst, op[2], op[1], op[0]));
1834 break;
1835
1836 case ir_triop_vector_insert:
1837 unreachable("should have been lowered by lower_vector_insert");
1838
1839 case ir_quadop_bitfield_insert:
1840 unreachable("not reached: should be handled by "
1841 "bitfield_insert_to_bfm_bfi\n");
1842
1843 case ir_quadop_vector:
1844 unreachable("not reached: should be handled by lower_quadop_vector");
1845
1846 case ir_unop_pack_half_2x16:
1847 emit_pack_half_2x16(result_dst, op[0]);
1848 break;
1849 case ir_unop_unpack_half_2x16:
1850 emit_unpack_half_2x16(result_dst, op[0]);
1851 break;
1852 case ir_unop_unpack_unorm_4x8:
1853 emit_unpack_unorm_4x8(result_dst, op[0]);
1854 break;
1855 case ir_unop_unpack_snorm_4x8:
1856 emit_unpack_snorm_4x8(result_dst, op[0]);
1857 break;
1858 case ir_unop_pack_unorm_4x8:
1859 emit_pack_unorm_4x8(result_dst, op[0]);
1860 break;
1861 case ir_unop_pack_snorm_4x8:
1862 emit_pack_snorm_4x8(result_dst, op[0]);
1863 break;
1864 case ir_unop_pack_snorm_2x16:
1865 case ir_unop_pack_unorm_2x16:
1866 case ir_unop_unpack_snorm_2x16:
1867 case ir_unop_unpack_unorm_2x16:
1868 unreachable("not reached: should be handled by lower_packing_builtins");
1869 case ir_unop_unpack_half_2x16_split_x:
1870 case ir_unop_unpack_half_2x16_split_y:
1871 case ir_binop_pack_half_2x16_split:
1872 case ir_unop_interpolate_at_centroid:
1873 case ir_binop_interpolate_at_sample:
1874 case ir_binop_interpolate_at_offset:
1875 unreachable("not reached: should not occur in vertex shader");
1876 case ir_binop_ldexp:
1877 unreachable("not reached: should be handled by ldexp_to_arith()");
1878 }
1879 }
1880
1881
1882 void
1883 vec4_visitor::visit(ir_swizzle *ir)
1884 {
1885 src_reg src;
1886 int i = 0;
1887 int swizzle[4];
1888
1889 /* Note that this only handles swizzles in expressions, not those on the left
1890 * hand side of an assignment, which do write masking. See ir_assignment
1891 * for that.
1892 */
1893
1894 ir->val->accept(this);
1895 src = this->result;
1896 assert(src.file != BAD_FILE);
1897
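/* Compose the IR swizzle with whatever swizzle is already on the source;
 * e.g. taking .y of a value that is itself v.zw reads channel w of v.
 */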
1898 for (i = 0; i < ir->type->vector_elements; i++) {
1899 switch (i) {
1900 case 0:
1901 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1902 break;
1903 case 1:
1904 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1905 break;
1906 case 2:
1907 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1908 break;
1909 case 3:
1910 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1911 break;
1912 }
1913 }
1914 for (; i < 4; i++) {
1915 /* Replicate the last channel out. */
1916 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1917 }
1918
1919 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1920
1921 this->result = src;
1922 }
1923
1924 void
1925 vec4_visitor::visit(ir_dereference_variable *ir)
1926 {
1927 const struct glsl_type *type = ir->type;
1928 dst_reg *reg = variable_storage(ir->var);
1929
1930 if (!reg) {
1931 fail("Failed to find variable storage for %s\n", ir->var->name);
1932 this->result = src_reg(brw_null_reg());
1933 return;
1934 }
1935
1936 this->result = src_reg(*reg);
1937
1938 /* System values get their swizzle from the dst_reg writemask */
1939 if (ir->var->data.mode == ir_var_system_value)
1940 return;
1941
1942 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1943 this->result.swizzle = swizzle_for_size(type->vector_elements);
1944 }
1945
1946
1947 int
1948 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1949 {
1950 /* Under normal circumstances array elements are stored consecutively, so
1951 * the stride is equal to the size of the array element.
1952 */
1953 return type_size(ir->type);
1954 }
1955
1956
1957 void
1958 vec4_visitor::visit(ir_dereference_array *ir)
1959 {
1960 ir_constant *constant_index;
1961 src_reg src;
1962 int array_stride = compute_array_stride(ir);
1963
1964 constant_index = ir->array_index->constant_expression_value();
1965
1966 ir->array->accept(this);
1967 src = this->result;
1968
1969 if (constant_index) {
1970 src.reg_offset += constant_index->value.i[0] * array_stride;
1971 } else {
1972 /* Variable index array dereference. It eats the "vec4" of the
1973 * base of the array and an index that offsets the Mesa register
1974 * index.
1975 */
1976 ir->array_index->accept(this);
1977
1978 src_reg index_reg;
1979
1980 if (array_stride == 1) {
1981 index_reg = this->result;
1982 } else {
1983 index_reg = src_reg(this, glsl_type::int_type);
1984
1985 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1986 }
1987
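/* If the base of the array was itself variably indexed (e.g. a[i][j]),
 * fold its existing relative offset into the new index.
 */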
1988 if (src.reladdr) {
1989 src_reg temp = src_reg(this, glsl_type::int_type);
1990
1991 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1992
1993 index_reg = temp;
1994 }
1995
1996 src.reladdr = ralloc(mem_ctx, src_reg);
1997 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1998 }
1999
2000 /* If the type is smaller than a vec4, replicate the last channel out. */
2001 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2002 src.swizzle = swizzle_for_size(ir->type->vector_elements);
2003 else
2004 src.swizzle = BRW_SWIZZLE_NOOP;
2005 src.type = brw_type_for_base_type(ir->type);
2006
2007 this->result = src;
2008 }
2009
2010 void
2011 vec4_visitor::visit(ir_dereference_record *ir)
2012 {
2013 unsigned int i;
2014 const glsl_type *struct_type = ir->record->type;
2015 int offset = 0;
2016
2017 ir->record->accept(this);
2018
2019 for (i = 0; i < struct_type->length; i++) {
2020 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2021 break;
2022 offset += type_size(struct_type->fields.structure[i].type);
2023 }
2024
2025 /* If the type is smaller than a vec4, replicate the last channel out. */
2026 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2027 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2028 else
2029 this->result.swizzle = BRW_SWIZZLE_NOOP;
2030 this->result.type = brw_type_for_base_type(ir->type);
2031
2032 this->result.reg_offset += offset;
2033 }
2034
2035 /**
2036 * We want to be careful in assignment setup to hit the actual storage
2037 * instead of potentially using a temporary like we might with the
2038 * ir_dereference handler.
2039 */
2040 static dst_reg
2041 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2042 {
2043 /* The LHS must be a dereference. If the LHS is a variable indexed array
2044 * access of a vector, it must be separated into a series of conditional moves
2045 * before reaching this point (see ir_vec_index_to_cond_assign).
2046 */
2047 assert(ir->as_dereference());
2048 ir_dereference_array *deref_array = ir->as_dereference_array();
2049 if (deref_array) {
2050 assert(!deref_array->array->type->is_vector());
2051 }
2052
2053 /* Use the rvalue deref handler for the most part. We'll ignore
2054 * swizzles in it and write swizzles using writemask, though.
2055 */
2056 ir->accept(v);
2057 return dst_reg(v->result);
2058 }
2059
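/**
 * Copy a (possibly aggregate) value one vec4 at a time, recursing through
 * struct fields, array elements and matrix columns until scalar/vector
 * leaves are reached, and advancing the register offsets as it goes.
 */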
2060 void
2061 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2062 const struct glsl_type *type,
2063 enum brw_predicate predicate)
2064 {
2065 if (type->base_type == GLSL_TYPE_STRUCT) {
2066 for (unsigned int i = 0; i < type->length; i++) {
2067 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2068 }
2069 return;
2070 }
2071
2072 if (type->is_array()) {
2073 for (unsigned int i = 0; i < type->length; i++) {
2074 emit_block_move(dst, src, type->fields.array, predicate);
2075 }
2076 return;
2077 }
2078
2079 if (type->is_matrix()) {
2080 const struct glsl_type *vec_type;
2081
2082 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2083 type->vector_elements, 1);
2084
2085 for (int i = 0; i < type->matrix_columns; i++) {
2086 emit_block_move(dst, src, vec_type, predicate);
2087 }
2088 return;
2089 }
2090
2091 assert(type->is_scalar() || type->is_vector());
2092
2093 dst->type = brw_type_for_base_type(type);
2094 src->type = dst->type;
2095
2096 dst->writemask = (1 << type->vector_elements) - 1;
2097
2098 src->swizzle = swizzle_for_size(type->vector_elements);
2099
2100 vec4_instruction *inst = emit(MOV(*dst, *src));
2101 inst->predicate = predicate;
2102
2103 dst->reg_offset++;
2104 src->reg_offset++;
2105 }
2106
2107
2108 /* If the RHS processing resulted in an instruction generating a
2109 * temporary value, and it would be easy to rewrite the instruction to
2110 * generate its result right into the LHS instead, do so. This ends
2111 * up reliably removing instructions where it can be tricky to do so
2112 * later without real UD chain information.
2113 */
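/* For example, "x = a + b" generates "ADD temp, a, b"; rather than following
 * it with "MOV x, temp", we retarget the ADD to write x directly and skip
 * the MOV entirely.
 */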
2114 bool
2115 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2116 dst_reg dst,
2117 src_reg src,
2118 vec4_instruction *pre_rhs_inst,
2119 vec4_instruction *last_rhs_inst)
2120 {
2121 /* This could be supported, but it would take more smarts. */
2122 if (ir->condition)
2123 return false;
2124
2125 if (pre_rhs_inst == last_rhs_inst)
2126 return false; /* No instructions generated to work with. */
2127
2128 /* Make sure the last instruction generated our source reg. */
2129 if (src.file != GRF ||
2130 src.file != last_rhs_inst->dst.file ||
2131 src.reg != last_rhs_inst->dst.reg ||
2132 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2133 src.reladdr ||
2134 src.abs ||
2135 src.negate ||
2136 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2137 return false;
2138
2139 /* Check that the last instruction fully initialized the channels
2140 * we want to use, in the order we want to use them. We could
2141 * potentially reswizzle the operands of many instructions so that
2142 * we could handle out of order channels, but don't yet.
2143 */
2144
2145 for (unsigned i = 0; i < 4; i++) {
2146 if (dst.writemask & (1 << i)) {
2147 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2148 return false;
2149
2150 if (BRW_GET_SWZ(src.swizzle, i) != i)
2151 return false;
2152 }
2153 }
2154
2155 /* Success! Rewrite the instruction. */
2156 last_rhs_inst->dst.file = dst.file;
2157 last_rhs_inst->dst.reg = dst.reg;
2158 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2159 last_rhs_inst->dst.reladdr = dst.reladdr;
2160 last_rhs_inst->dst.writemask &= dst.writemask;
2161
2162 return true;
2163 }
2164
2165 void
2166 vec4_visitor::visit(ir_assignment *ir)
2167 {
2168 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2169 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2170
2171 if (!ir->lhs->type->is_scalar() &&
2172 !ir->lhs->type->is_vector()) {
2173 ir->rhs->accept(this);
2174 src_reg src = this->result;
2175
2176 if (ir->condition) {
2177 emit_bool_to_cond_code(ir->condition, &predicate);
2178 }
2179
2180 /* emit_block_move doesn't account for swizzles in the source register.
2181 * This should be ok, since the source register is a structure or an
2182 * array, and those can't be swizzled. But double-check to be sure.
2183 */
2184 assert(src.swizzle ==
2185 (ir->rhs->type->is_matrix()
2186 ? swizzle_for_size(ir->rhs->type->vector_elements)
2187 : BRW_SWIZZLE_NOOP));
2188
2189 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2190 return;
2191 }
2192
2193 /* Now we're down to just a scalar/vector with writemasks. */
2194 int i;
2195
2196 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2197 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2198
2199 ir->rhs->accept(this);
2200
2201 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2202
2203 src_reg src = this->result;
2204
2205 int swizzles[4];
2206 int first_enabled_chan = 0;
2207 int src_chan = 0;
2208
2209 assert(ir->lhs->type->is_vector() ||
2210 ir->lhs->type->is_scalar());
2211 dst.writemask = ir->write_mask;
2212
2213 for (int i = 0; i < 4; i++) {
2214 if (dst.writemask & (1 << i)) {
2215 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2216 break;
2217 }
2218 }
2219
2220 /* Swizzle a small RHS vector into the channels being written.
2221 *
2222 * glsl ir treats write_mask as dictating how many channels are
2223 * present on the RHS, while in our instructions we need to make
2224 * those channels appear in the slots of the vec4 they're written to.
2225 */
2226 for (int i = 0; i < 4; i++) {
2227 if (dst.writemask & (1 << i))
2228 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2229 else
2230 swizzles[i] = first_enabled_chan;
2231 }
2232 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2233 swizzles[2], swizzles[3]);
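/* For example, for "v.zw = foo.xy" the write mask is ZW and the vec2 RHS
 * (read as .xyyy) becomes .yyxy, so RHS .x lands in the z slot and .y in
 * the w slot; the unwritten slots just replicate an enabled channel.
 */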
2234
2235 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2236 return;
2237 }
2238
2239 if (ir->condition) {
2240 emit_bool_to_cond_code(ir->condition, &predicate);
2241 }
2242
2243 for (i = 0; i < type_size(ir->lhs->type); i++) {
2244 vec4_instruction *inst = emit(MOV(dst, src));
2245 inst->predicate = predicate;
2246
2247 dst.reg_offset++;
2248 src.reg_offset++;
2249 }
2250 }
2251
2252 void
2253 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2254 {
2255 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2256 foreach_in_list(ir_constant, field_value, &ir->components) {
2257 emit_constant_values(dst, field_value);
2258 }
2259 return;
2260 }
2261
2262 if (ir->type->is_array()) {
2263 for (unsigned int i = 0; i < ir->type->length; i++) {
2264 emit_constant_values(dst, ir->array_elements[i]);
2265 }
2266 return;
2267 }
2268
2269 if (ir->type->is_matrix()) {
2270 for (int i = 0; i < ir->type->matrix_columns; i++) {
2271 float *vec = &ir->value.f[i * ir->type->vector_elements];
2272
2273 for (int j = 0; j < ir->type->vector_elements; j++) {
2274 dst->writemask = 1 << j;
2275 dst->type = BRW_REGISTER_TYPE_F;
2276
2277 emit(MOV(*dst, src_reg(vec[j])));
2278 }
2279 dst->reg_offset++;
2280 }
2281 return;
2282 }
2283
2284 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2285
2286 for (int i = 0; i < ir->type->vector_elements; i++) {
2287 if (!(remaining_writemask & (1 << i)))
2288 continue;
2289
2290 dst->writemask = 1 << i;
2291 dst->type = brw_type_for_base_type(ir->type);
2292
2293 /* Find other components that match the one we're about to
2294 * write. Emits fewer instructions for things like vec4(0.5,
2295 * 1.5, 1.5, 1.5).
2296 */
2297 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2298 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2299 if (ir->value.b[i] == ir->value.b[j])
2300 dst->writemask |= (1 << j);
2301 } else {
2302 /* u, i, and f storage all line up, so no need for a
2303 * switch case for comparing each type.
2304 */
2305 if (ir->value.u[i] == ir->value.u[j])
2306 dst->writemask |= (1 << j);
2307 }
2308 }
2309
2310 switch (ir->type->base_type) {
2311 case GLSL_TYPE_FLOAT:
2312 emit(MOV(*dst, src_reg(ir->value.f[i])));
2313 break;
2314 case GLSL_TYPE_INT:
2315 emit(MOV(*dst, src_reg(ir->value.i[i])));
2316 break;
2317 case GLSL_TYPE_UINT:
2318 emit(MOV(*dst, src_reg(ir->value.u[i])));
2319 break;
2320 case GLSL_TYPE_BOOL:
2321 emit(MOV(*dst,
2322 src_reg(ir->value.b[i] != 0 ? (int)ctx->Const.UniformBooleanTrue
2323 : 0)));
2324 break;
2325 default:
2326 unreachable("Non-float/uint/int/bool constant");
2327 }
2328
2329 remaining_writemask &= ~dst->writemask;
2330 }
2331 dst->reg_offset++;
2332 }
2333
2334 void
2335 vec4_visitor::visit(ir_constant *ir)
2336 {
2337 dst_reg dst = dst_reg(this, ir->type);
2338 this->result = src_reg(dst);
2339
2340 emit_constant_values(&dst, ir);
2341 }
2342
2343 void
2344 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2345 {
2346 ir_dereference *deref = static_cast<ir_dereference *>(
2347 ir->actual_parameters.get_head());
2348 ir_variable *location = deref->variable_referenced();
2349 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2350 location->data.binding);
2351
2352 /* Calculate the surface offset */
2353 src_reg offset(this, glsl_type::uint_type);
2354 ir_dereference_array *deref_array = deref->as_dereference_array();
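/* For an array of counters the offset is array_index * ATOMIC_COUNTER_SIZE
 * plus the counter's declared offset; a non-array counter just uses its
 * declared offset directly.
 */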
2355 if (deref_array) {
2356 deref_array->array_index->accept(this);
2357
2358 src_reg tmp(this, glsl_type::uint_type);
2359 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2360 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2361 } else {
2362 offset = location->data.atomic.offset;
2363 }
2364
2365 /* Emit the appropriate machine instruction */
2366 const char *callee = ir->callee->function_name();
2367 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2368
2369 if (!strcmp("__intrinsic_atomic_read", callee)) {
2370 emit_untyped_surface_read(surf_index, dst, offset);
2371
2372 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2373 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2374 src_reg(), src_reg());
2375
2376 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2377 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2378 src_reg(), src_reg());
2379 }
2380 }
2381
2382 void
2383 vec4_visitor::visit(ir_call *ir)
2384 {
2385 const char *callee = ir->callee->function_name();
2386
2387 if (!strcmp("__intrinsic_atomic_read", callee) ||
2388 !strcmp("__intrinsic_atomic_increment", callee) ||
2389 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2390 visit_atomic_counter_intrinsic(ir);
2391 } else {
2392 unreachable("Unsupported intrinsic.");
2393 }
2394 }
2395
2396 src_reg
2397 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2398 {
2399 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2400 inst->base_mrf = 2;
2401 inst->mlen = 1;
2402 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2403 inst->dst.writemask = WRITEMASK_XYZW;
2404
2405 inst->src[1] = sampler;
2406
2407 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2408 int param_base = inst->base_mrf;
2409 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2410 int zero_mask = 0xf & ~coord_mask;
2411
2412 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2413 coordinate));
2414
2415 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2416 src_reg(0)));
2417
2418 emit(inst);
2419 return src_reg(inst->dst);
2420 }
2421
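/**
 * A "high" sampler is one whose index may be 16 or greater (any
 * non-immediate index might be); such samplers need the message header
 * and are only usable on Haswell and Gen8+.
 */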
2422 static bool
2423 is_high_sampler(struct brw_context *brw, src_reg sampler)
2424 {
2425 if (brw->gen < 8 && !brw->is_haswell)
2426 return false;
2427
2428 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2429 }
2430
2431 void
2432 vec4_visitor::visit(ir_texture *ir)
2433 {
2434 uint32_t sampler =
2435 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2436
2437 ir_rvalue *nonconst_sampler_index =
2438 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2439
2440 /* Handle non-constant sampler array indexing */
2441 src_reg sampler_reg;
2442 if (nonconst_sampler_index) {
2443 /* The highest sampler which may be used by this operation is
2444 * the last element of the array. Mark it here, because the generator
2445 * doesn't have enough information to determine the bound.
2446 */
2447 uint32_t array_size = ir->sampler->as_dereference_array()
2448 ->array->type->array_size();
2449
2450 uint32_t max_used = sampler + array_size - 1;
2451 if (ir->op == ir_tg4 && brw->gen < 8) {
2452 max_used += prog_data->base.binding_table.gather_texture_start;
2453 } else {
2454 max_used += prog_data->base.binding_table.texture_start;
2455 }
2456
2457 brw_mark_surface_used(&prog_data->base, max_used);
2458
2459 /* Emit code to evaluate the actual indexing expression */
2460 nonconst_sampler_index->accept(this);
2461 dst_reg temp(this, glsl_type::uint_type);
2462 emit(ADD(temp, this->result, src_reg(sampler)))
2463 ->force_writemask_all = true;
2464 sampler_reg = src_reg(temp);
2465 } else {
2466 /* Single sampler, or constant array index; the indexing expression
2467 * is just an immediate.
2468 */
2469 sampler_reg = src_reg(sampler);
2470 }
2471
2472 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2473 * emitting anything other than setting up the constant result.
2474 */
2475 if (ir->op == ir_tg4) {
2476 ir_constant *chan = ir->lod_info.component->as_constant();
2477 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2478 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2479 dst_reg result(this, ir->type);
2480 this->result = src_reg(result);
2481 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2482 return;
2483 }
2484 }
2485
2486 /* Should be lowered by do_lower_texture_projection */
2487 assert(!ir->projector);
2488
2489 /* Should be lowered */
2490 assert(!ir->offset || !ir->offset->type->is_array());
2491
2492 /* Generate code to compute all the subexpression trees. This has to be
2493 * done before loading any values into MRFs for the sampler message since
2494 * generating these values may involve SEND messages that need the MRFs.
2495 */
2496 src_reg coordinate;
2497 if (ir->coordinate) {
2498 ir->coordinate->accept(this);
2499 coordinate = this->result;
2500 }
2501
2502 src_reg shadow_comparitor;
2503 if (ir->shadow_comparitor) {
2504 ir->shadow_comparitor->accept(this);
2505 shadow_comparitor = this->result;
2506 }
2507
2508 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2509 src_reg offset_value;
2510 if (has_nonconstant_offset) {
2511 ir->offset->accept(this);
2512 offset_value = src_reg(this->result);
2513 }
2514
2515 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2516 src_reg lod, dPdx, dPdy, sample_index, mcs;
2517 switch (ir->op) {
2518 case ir_tex:
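/* Vertex shaders have no implicit derivatives, so plain texture() is
 * implemented as an explicit-LOD sample with lod forced to 0 (TXL below).
 */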
2519 lod = src_reg(0.0f);
2520 lod_type = glsl_type::float_type;
2521 break;
2522 case ir_txf:
2523 case ir_txl:
2524 case ir_txs:
2525 ir->lod_info.lod->accept(this);
2526 lod = this->result;
2527 lod_type = ir->lod_info.lod->type;
2528 break;
2529 case ir_query_levels:
2530 lod = src_reg(0);
2531 lod_type = glsl_type::int_type;
2532 break;
2533 case ir_txf_ms:
2534 ir->lod_info.sample_index->accept(this);
2535 sample_index = this->result;
2536 sample_index_type = ir->lod_info.sample_index->type;
2537
2538 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2539 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2540 else
2541 mcs = src_reg(0u);
2542 break;
2543 case ir_txd:
2544 ir->lod_info.grad.dPdx->accept(this);
2545 dPdx = this->result;
2546
2547 ir->lod_info.grad.dPdy->accept(this);
2548 dPdy = this->result;
2549
2550 lod_type = ir->lod_info.grad.dPdx->type;
2551 break;
2552 case ir_txb:
2553 case ir_lod:
2554 case ir_tg4:
2555 break;
2556 }
2557
2558 enum opcode opcode;
2559 switch (ir->op) {
2560 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2561 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2562 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2563 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2564 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2565 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2566 case ir_tg4: opcode = has_nonconstant_offset
2567 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2568 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2569 case ir_txb:
2570 unreachable("TXB is not valid for vertex shaders.");
2571 case ir_lod:
2572 unreachable("LOD is not valid for vertex shaders.");
2573 default:
2574 unreachable("Unrecognized tex op");
2575 }
2576
2577 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2578
2579 if (ir->offset != NULL && !has_nonconstant_offset) {
2580 inst->offset =
2581 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2582 ir->offset->type->vector_elements);
2583 }
2584
2585 /* Stuff the channel select bits in the top of the texture offset */
2586 if (ir->op == ir_tg4)
2587 inst->offset |= gather_channel(ir, sampler) << 16;
2588
2589 /* The message header is necessary for:
2590 * - Gen4 (always)
2591 * - Texel offsets
2592 * - Gather channel selection
2593 * - Sampler indices too large to fit in a 4-bit value.
2594 */
2595 inst->header_present =
2596 brw->gen < 5 || inst->offset != 0 || ir->op == ir_tg4 ||
2597 is_high_sampler(brw, sampler_reg);
2598 inst->base_mrf = 2;
2599 inst->mlen = inst->header_present + 1; /* always at least one */
2600 inst->dst = dst_reg(this, ir->type);
2601 inst->dst.writemask = WRITEMASK_XYZW;
2602 inst->shadow_compare = ir->shadow_comparitor != NULL;
2603
2604 inst->src[1] = sampler_reg;
2605
2606 /* MRF for the first parameter */
2607 int param_base = inst->base_mrf + inst->header_present;
2608
2609 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2610 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2611 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2612 } else {
2613 /* Load the coordinate */
2614 /* FINISHME: gl_clamp_mask and saturate */
2615 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2616 int zero_mask = 0xf & ~coord_mask;
2617
2618 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2619 coordinate));
2620
2621 if (zero_mask != 0) {
2622 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2623 src_reg(0)));
2624 }
2625 /* Load the shadow comparitor */
2626 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2627 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2628 WRITEMASK_X),
2629 shadow_comparitor));
2630 inst->mlen++;
2631 }
2632
2633 /* Load the LOD info */
2634 if (ir->op == ir_tex || ir->op == ir_txl) {
2635 int mrf, writemask;
2636 if (brw->gen >= 5) {
2637 mrf = param_base + 1;
2638 if (ir->shadow_comparitor) {
2639 writemask = WRITEMASK_Y;
2640 /* mlen already incremented */
2641 } else {
2642 writemask = WRITEMASK_X;
2643 inst->mlen++;
2644 }
2645 } else /* brw->gen == 4 */ {
2646 mrf = param_base;
2647 writemask = WRITEMASK_W;
2648 }
2649 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2650 } else if (ir->op == ir_txf) {
2651 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2652 } else if (ir->op == ir_txf_ms) {
2653 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2654 sample_index));
2655 if (brw->gen >= 7) {
2656 /* MCS data is in the first channel of `mcs`, but we need to get it into
2657 * the .y channel of the second vec4 of params, so replicate .x across
2658 * the whole vec4 and then mask off everything except .y
2659 */
2660 mcs.swizzle = BRW_SWIZZLE_XXXX;
2661 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2662 mcs));
2663 }
2664 inst->mlen++;
2665 } else if (ir->op == ir_txd) {
2666 const glsl_type *type = lod_type;
2667
2668 if (brw->gen >= 5) {
2669 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2670 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2671 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2672 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2673 inst->mlen++;
2674
2675 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2676 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2677 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2678 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2679 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2680 inst->mlen++;
2681
2682 if (ir->shadow_comparitor) {
2683 emit(MOV(dst_reg(MRF, param_base + 2,
2684 ir->shadow_comparitor->type, WRITEMASK_Z),
2685 shadow_comparitor));
2686 }
2687 }
2688 } else /* brw->gen == 4 */ {
2689 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2690 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2691 inst->mlen += 2;
2692 }
2693 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2694 if (ir->shadow_comparitor) {
2695 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2696 shadow_comparitor));
2697 }
2698
2699 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2700 offset_value));
2701 inst->mlen++;
2702 }
2703 }
2704
2705 emit(inst);
2706
2707 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2708 * spec requires layers.
2709 */
2710 if (ir->op == ir_txs) {
2711 glsl_type const *type = ir->sampler->type;
2712 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2713 type->sampler_array) {
2714 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2715 writemask(inst->dst, WRITEMASK_Z),
2716 src_reg(inst->dst), src_reg(6));
2717 }
2718 }
2719
2720 if (brw->gen == 6 && ir->op == ir_tg4) {
2721 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2722 }
2723
2724 swizzle_result(ir, src_reg(inst->dst), sampler);
2725 }
2726
2727 /**
2728 * Apply workarounds for Gen6 gather with UINT/SINT
2729 */
2730 void
2731 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2732 {
2733 if (!wa)
2734 return;
2735
2736 int width = (wa & WA_8BIT) ? 8 : 16;
2737 dst_reg dst_f = dst;
2738 dst_f.type = BRW_REGISTER_TYPE_F;
2739
2740 /* Convert from UNORM to UINT */
2741 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2742 emit(MOV(dst, src_reg(dst_f)));
2743
2744 if (wa & WA_SIGN) {
2745 /* Reinterpret the UINT value as a signed INT value by
2746 * shifting the sign bit into place, then shifting back
2747 * preserving sign.
2748 */
2749 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2750 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2751 }
2752 }
2753
2754 /**
2755 * Set up the gather channel based on the swizzle, for gather4.
2756 */
2757 uint32_t
2758 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2759 {
2760 ir_constant *chan = ir->lod_info.component->as_constant();
2761 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2762 switch (swiz) {
2763 case SWIZZLE_X: return 0;
2764 case SWIZZLE_Y:
2765 /* gather4 sampler is broken for green channel on RG32F --
2766 * we must ask for blue instead.
2767 */
2768 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2769 return 2;
2770 return 1;
2771 case SWIZZLE_Z: return 2;
2772 case SWIZZLE_W: return 3;
2773 default:
2774 unreachable("Not reached"); /* zero, one swizzles handled already */
2775 }
2776 }
2777
2778 void
2779 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2780 {
2781 int s = key->tex.swizzles[sampler];
2782
2783 this->result = src_reg(this, ir->type);
2784 dst_reg swizzled_result(this->result);
2785
2786 if (ir->op == ir_query_levels) {
2787 /* # levels is in .w */
2788 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2789 emit(MOV(swizzled_result, orig_val));
2790 return;
2791 }
2792
2793 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2794 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2795 emit(MOV(swizzled_result, orig_val));
2796 return;
2797 }
2798
2799
2800 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2801 int swizzle[4] = {0};
2802
2803 for (int i = 0; i < 4; i++) {
2804 switch (GET_SWZ(s, i)) {
2805 case SWIZZLE_ZERO:
2806 zero_mask |= (1 << i);
2807 break;
2808 case SWIZZLE_ONE:
2809 one_mask |= (1 << i);
2810 break;
2811 default:
2812 copy_mask |= (1 << i);
2813 swizzle[i] = GET_SWZ(s, i);
2814 break;
2815 }
2816 }
2817
2818 if (copy_mask) {
2819 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2820 swizzled_result.writemask = copy_mask;
2821 emit(MOV(swizzled_result, orig_val));
2822 }
2823
2824 if (zero_mask) {
2825 swizzled_result.writemask = zero_mask;
2826 emit(MOV(swizzled_result, src_reg(0.0f)));
2827 }
2828
2829 if (one_mask) {
2830 swizzled_result.writemask = one_mask;
2831 emit(MOV(swizzled_result, src_reg(1.0f)));
2832 }
2833 }
2834
2835 void
2836 vec4_visitor::visit(ir_return *)
2837 {
2838 unreachable("not reached");
2839 }
2840
2841 void
2842 vec4_visitor::visit(ir_discard *)
2843 {
2844 unreachable("not reached");
2845 }
2846
2847 void
2848 vec4_visitor::visit(ir_if *ir)
2849 {
2850 /* Don't point the annotation at the if statement, because then it plus
2851 * the then and else blocks get printed.
2852 */
2853 this->base_ir = ir->condition;
2854
2855 if (brw->gen == 6) {
2856 emit_if_gen6(ir);
2857 } else {
2858 enum brw_predicate predicate;
2859 emit_bool_to_cond_code(ir->condition, &predicate);
2860 emit(IF(predicate));
2861 }
2862
2863 visit_instructions(&ir->then_instructions);
2864
2865 if (!ir->else_instructions.is_empty()) {
2866 this->base_ir = ir->condition;
2867 emit(BRW_OPCODE_ELSE);
2868
2869 visit_instructions(&ir->else_instructions);
2870 }
2871
2872 this->base_ir = ir->condition;
2873 emit(BRW_OPCODE_ENDIF);
2874 }
2875
2876 void
2877 vec4_visitor::visit(ir_emit_vertex *)
2878 {
2879 unreachable("not reached");
2880 }
2881
2882 void
2883 vec4_visitor::visit(ir_end_primitive *)
2884 {
2885 unreachable("not reached");
2886 }
2887
2888 void
2889 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2890 dst_reg dst, src_reg offset,
2891 src_reg src0, src_reg src1)
2892 {
2893 unsigned mlen = 0;
2894
2895 /* Set the atomic operation offset. */
2896 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2897 mlen++;
2898
2899 /* Set the atomic operation arguments. */
2900 if (src0.file != BAD_FILE) {
2901 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2902 mlen++;
2903 }
2904
2905 if (src1.file != BAD_FILE) {
2906 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2907 mlen++;
2908 }
2909
2910 /* Emit the instruction. Note that this maps to the normal SIMD8
2911 * untyped atomic message on Ivy Bridge, but that's OK because
2912 * unused channels will be masked out.
2913 */
2914 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2915 src_reg(atomic_op), src_reg(surf_index));
2916 inst->base_mrf = 0;
2917 inst->mlen = mlen;
2918 }
2919
2920 void
2921 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2922 src_reg offset)
2923 {
2924 /* Set the surface read offset. */
2925 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2926
2927 /* Emit the instruction. Note that this maps to the normal SIMD8
2928 * untyped surface read message, but that's OK because unused
2929 * channels will be masked out.
2930 */
2931 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2932 dst, src_reg(surf_index));
2933 inst->base_mrf = 0;
2934 inst->mlen = 1;
2935 }
2936
2937 void
2938 vec4_visitor::emit_ndc_computation()
2939 {
2940 /* Get the position */
2941 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2942
2943 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2944 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2945 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2946
2947 current_annotation = "NDC";
2948 dst_reg ndc_w = ndc;
2949 ndc_w.writemask = WRITEMASK_W;
2950 src_reg pos_w = pos;
2951 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2952 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2953
2954 dst_reg ndc_xyz = ndc;
2955 ndc_xyz.writemask = WRITEMASK_XYZ;
2956
2957 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2958 }
2959
2960 void
2961 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2962 {
2963 if (brw->gen < 6 &&
2964 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2965 key->userclip_active || brw->has_negative_rhw_bug)) {
2966 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2967 dst_reg header1_w = header1;
2968 header1_w.writemask = WRITEMASK_W;
2969
2970 emit(MOV(header1, 0u));
2971
2972 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2973 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2974
2975 current_annotation = "Point size";
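/* Scale the float point size by 2^11 and keep bits 8..18 of the result,
 * packing it into the header's point width field.
 */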
2976 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2977 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2978 }
2979
2980 if (key->userclip_active) {
2981 current_annotation = "Clipping flags";
2982 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2983 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2984
2985 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2986 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2987 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2988
2989 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2990 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2991 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2992 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2993 }
2994
2995 /* i965 clipping workaround:
2996 * 1) Test for -ve rhw
2997 * 2) If set,
2998 * set ndc = (0,0,0,0)
2999 * set ucp[6] = 1
3000 *
3001 * Later, clipping will detect ucp[6] and ensure the primitive is
3002 * clipped against all fixed planes.
3003 */
3004 if (brw->has_negative_rhw_bug) {
3005 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3006 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3007 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3008 vec4_instruction *inst;
3009 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3010 inst->predicate = BRW_PREDICATE_NORMAL;
3011 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3012 inst->predicate = BRW_PREDICATE_NORMAL;
3013 }
3014
3015 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3016 } else if (brw->gen < 6) {
3017 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3018 } else {
3019 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3020 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3021 dst_reg reg_w = reg;
3022 reg_w.writemask = WRITEMASK_W;
3023 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3024 }
3025 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3026 dst_reg reg_y = reg;
3027 reg_y.writemask = WRITEMASK_Y;
3028 reg_y.type = BRW_REGISTER_TYPE_D;
3029 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3030 }
3031 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3032 dst_reg reg_z = reg;
3033 reg_z.writemask = WRITEMASK_Z;
3034 reg_z.type = BRW_REGISTER_TYPE_D;
3035 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3036 }
3037 }
3038 }
3039
3040 void
3041 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3042 {
3043 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3044 *
3045 * "If a linked set of shaders forming the vertex stage contains no
3046 * static write to gl_ClipVertex or gl_ClipDistance, but the
3047 * application has requested clipping against user clip planes through
3048 * the API, then the coordinate written to gl_Position is used for
3049 * comparison against the user clip planes."
3050 *
3051 * This function is only called if the shader didn't write to
3052 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3053 * if the user wrote to it; otherwise we use gl_Position.
3054 */
3055 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3056 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3057 clip_vertex = VARYING_SLOT_POS;
3058 }
3059
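/* Each clip distance is the DP4 of the clip vertex with the corresponding
 * user clip plane; up to four distances are written here, one per channel
 * of reg.
 */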
3060 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3061 ++i) {
3062 reg.writemask = 1 << i;
3063 emit(DP4(reg,
3064 src_reg(output_reg[clip_vertex]),
3065 src_reg(this->userplane[i + offset])));
3066 }
3067 }
3068
3069 vec4_instruction *
3070 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3071 {
3072 assert (varying < VARYING_SLOT_MAX);
3073 reg.type = output_reg[varying].type;
3074 current_annotation = output_reg_annotation[varying];
3075 /* Copy the register, saturating if necessary */
3076 return emit(MOV(reg, src_reg(output_reg[varying])));
3077 }
3078
3079 void
3080 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3081 {
3082 reg.type = BRW_REGISTER_TYPE_F;
3083
3084 switch (varying) {
3085 case VARYING_SLOT_PSIZ:
3086 {
3087 /* PSIZ is always in slot 0, and is coupled with other flags. */
3088 current_annotation = "indices, point width, clip flags";
3089 emit_psiz_and_flags(reg);
3090 break;
3091 }
3092 case BRW_VARYING_SLOT_NDC:
3093 current_annotation = "NDC";
3094 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3095 break;
3096 case VARYING_SLOT_POS:
3097 current_annotation = "gl_Position";
3098 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3099 break;
3100 case VARYING_SLOT_EDGE:
3101 /* This is present when doing unfilled polygons. We're supposed to copy
3102 * the edge flag from the user-provided vertex array
3103 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3104 * of that attribute (starts as 1.0f). This is then used in clipping to
3105 * determine which edges should be drawn as wireframe.
3106 */
3107 current_annotation = "edge flag";
3108 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3109 glsl_type::float_type, WRITEMASK_XYZW))));
3110 break;
3111 case BRW_VARYING_SLOT_PAD:
3112 /* No need to write to this slot */
3113 break;
3114 case VARYING_SLOT_COL0:
3115 case VARYING_SLOT_COL1:
3116 case VARYING_SLOT_BFC0:
3117 case VARYING_SLOT_BFC1: {
3118 /* These built-in varyings are only supported in compatibility mode,
3119 * and we only support GS in core profile. So, this must be a vertex
3120 * shader.
3121 */
3122 assert(stage == MESA_SHADER_VERTEX);
3123 vec4_instruction *inst = emit_generic_urb_slot(reg, varying);
3124 if (((struct brw_vs_prog_key *) key)->clamp_vertex_color)
3125 inst->saturate = true;
3126 break;
3127 }
3128
3129 default:
3130 emit_generic_urb_slot(reg, varying);
3131 break;
3132 }
3133 }
3134
3135 static int
3136 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3137 {
3138 if (brw->gen >= 6) {
3139 /* URB data written (does not include the message header reg) must
3140 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3141 * section 5.4.3.2.2: URB_INTERLEAVED.
3142 *
3143 * URB entries are allocated on a multiple of 1024 bits, so an
3144 * extra 128 bits written here to make the end align to 256 is
3145 * no problem.
3146 */
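/* mlen includes the one-register message header, so the data length is
 * mlen - 1; bumping even values of mlen to odd keeps the data length a
 * multiple of two registers (e.g. header + 3 data regs is padded to
 * header + 4).
 */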
3147 if ((mlen % 2) != 1)
3148 mlen++;
3149 }
3150
3151 return mlen;
3152 }
3153
3154
3155 /**
3156 * Generates the VUE payload plus the necessary URB write instructions to
3157 * output it.
3158 *
3159 * The VUE layout is documented in Volume 2a.
3160 */
3161 void
3162 vec4_visitor::emit_vertex()
3163 {
3164 /* MRF 0 is reserved for the debugger, so start with message header
3165 * in MRF 1.
3166 */
3167 int base_mrf = 1;
3168 int mrf = base_mrf;
3169 /* In the process of generating our URB write message contents, we
3170 * may need to unspill a register or load from an array. Those
3171 * reads would use MRFs 14-15.
3172 */
3173 int max_usable_mrf = 13;
3174
3175 /* The following assertion verifies that max_usable_mrf causes an
3176 * even-numbered amount of URB write data, which will meet gen6's
3177 * requirements for length alignment.
3178 */
3179 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3180
3181 /* First mrf is the g0-based message header containing URB handles and
3182 * such.
3183 */
3184 emit_urb_write_header(mrf++);
3185
3186 if (brw->gen < 6) {
3187 emit_ndc_computation();
3188 }
3189
3190 /* Lower legacy fixed-function and ClipVertex clipping to clip distances */
3191 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3192 current_annotation = "user clip distances";
3193
3194 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3195 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3196
3197 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3198 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3199 }
3200
3201 /* We may need to split this up into several URB writes, so do them in a
3202 * loop.
3203 */
3204 int slot = 0;
3205 bool complete = false;
3206 do {
3207 /* URB offset is in URB row increments, and each of our MRFs is half of
3208 * one of those, since we're doing interleaved writes.
3209 */
3210 int offset = slot / 2;
3211
3212 mrf = base_mrf + 1;
3213 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3214 emit_urb_slot(dst_reg(MRF, mrf++),
3215 prog_data->vue_map.slot_to_varying[slot]);
3216
3217 /* If this was max_usable_mrf, we can't fit anything more into this
3218 * URB WRITE.
3219 */
3220 if (mrf > max_usable_mrf) {
3221 slot++;
3222 break;
3223 }
3224 }
3225
3226 complete = slot >= prog_data->vue_map.num_slots;
3227 current_annotation = "URB write";
3228 vec4_instruction *inst = emit_urb_write_opcode(complete);
3229 inst->base_mrf = base_mrf;
3230 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3231 inst->offset += offset;
3232 } while (!complete);
3233 }
3234
3235
3236 src_reg
3237 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3238 src_reg *reladdr, int reg_offset)
3239 {
3240 /* Because we store the values to scratch interleaved like our
3241 * vertex data, we need to scale the vec4 index by 2.
3242 */
3243 int message_header_scale = 2;
3244
3245 /* Pre-gen6, the message header uses byte offsets instead of vec4
3246 * (16-byte) offset units.
3247 */
3248 if (brw->gen < 6)
3249 message_header_scale *= 16;
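/* For example, vec4 slot 3 becomes offset 6 in vec4 (16-byte) units on
 * Gen6+, or byte offset 96 on earlier generations.
 */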
3250
3251 if (reladdr) {
3252 src_reg index = src_reg(this, glsl_type::int_type);
3253
3254 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3255 src_reg(reg_offset)));
3256 emit_before(block, inst, MUL(dst_reg(index), index,
3257 src_reg(message_header_scale)));
3258
3259 return index;
3260 } else {
3261 return src_reg(reg_offset * message_header_scale);
3262 }
3263 }
3264
3265 src_reg
3266 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3267 src_reg *reladdr, int reg_offset)
3268 {
3269 if (reladdr) {
3270 src_reg index = src_reg(this, glsl_type::int_type);
3271
3272 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3273 src_reg(reg_offset)));
3274
3275 /* Pre-gen6, the message header uses byte offsets instead of vec4
3276 * (16-byte) offset units.
3277 */
3278 if (brw->gen < 6) {
3279 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3280 }
3281
3282 return index;
3283 } else if (brw->gen >= 8) {
3284 /* Store the offset in a GRF so we can send-from-GRF. */
3285 src_reg offset = src_reg(this, glsl_type::int_type);
3286 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3287 return offset;
3288 } else {
3289 int message_header_scale = brw->gen < 6 ? 16 : 1;
3290 return src_reg(reg_offset * message_header_scale);
3291 }
3292 }
3293
3294 /**
3295 * Emits an instruction before @inst to load the value named by @orig_src
3296 * from scratch space at @base_offset to @temp.
3297 *
3298 * @base_offset is measured in 32-byte units (the size of a register).
3299 */
3300 void
3301 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3302 dst_reg temp, src_reg orig_src,
3303 int base_offset)
3304 {
3305 int reg_offset = base_offset + orig_src.reg_offset;
3306 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3307 reg_offset);
3308
3309 emit_before(block, inst, SCRATCH_READ(temp, index));
3310 }
3311
3312 /**
3313 * Emits an instruction after @inst to store the value to be written
3314 * to @orig_dst to scratch space at @base_offset, from @temp.
3315 *
3316 * @base_offset is measured in 32-byte units (the size of a register).
3317 */
3318 void
3319 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3320 int base_offset)
3321 {
3322 int reg_offset = base_offset + inst->dst.reg_offset;
3323 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3324 reg_offset);
3325
3326 /* Create a temporary register to store *inst's result in.
3327 *
3328 * We have to be careful when MOVing from our temporary result register in
3329 * the scratch write. If we swizzle from channels of the temporary that
3330 * weren't initialized, it will confuse live interval analysis, which will
3331 * make spilling fail to make progress.
3332 */
3333 src_reg temp = src_reg(this, glsl_type::vec4_type);
3334 temp.type = inst->dst.type;
3335 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3336 int swizzles[4];
3337 for (int i = 0; i < 4; i++)
3338 if (inst->dst.writemask & (1 << i))
3339 swizzles[i] = i;
3340 else
3341 swizzles[i] = first_writemask_chan;
3342 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3343 swizzles[2], swizzles[3]);
3344
3345 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3346 inst->dst.writemask));
3347 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3348 write->predicate = inst->predicate;
3349 write->ir = inst->ir;
3350 write->annotation = inst->annotation;
3351 inst->insert_after(block, write);
3352
3353 inst->dst.file = temp.file;
3354 inst->dst.reg = temp.reg;
3355 inst->dst.reg_offset = temp.reg_offset;
3356 inst->dst.reladdr = NULL;
3357 }
3358
3359 /**
3360 * We can't generally support array access in GRF space, because a
3361 * single instruction's destination can only span 2 contiguous
3362 * registers. So, we send all GRF arrays that get variable index
3363 * access to scratch space.
3364 */
3365 void
3366 vec4_visitor::move_grf_array_access_to_scratch()
3367 {
3368 int scratch_loc[this->virtual_grf_count];
3369 memset(scratch_loc, -1, sizeof(scratch_loc));
3370
3371 /* First, calculate the set of virtual GRFs that need to be punted
3372 * to scratch due to having any array access on them, and where in
3373 * scratch.
3374 */
3375 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3376 if (inst->dst.file == GRF && inst->dst.reladdr &&
3377 scratch_loc[inst->dst.reg] == -1) {
3378 scratch_loc[inst->dst.reg] = c->last_scratch;
3379 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3380 }
3381
3382 for (int i = 0 ; i < 3; i++) {
3383 src_reg *src = &inst->src[i];
3384
3385 if (src->file == GRF && src->reladdr &&
3386 scratch_loc[src->reg] == -1) {
3387 scratch_loc[src->reg] = c->last_scratch;
3388 c->last_scratch += this->virtual_grf_sizes[src->reg];
3389 }
3390 }
3391 }
3392
3393 /* Now, for anything that will be accessed through scratch, rewrite
3394 * it to load/store. Note that this is a _safe list walk, because
3395 * we may generate a new scratch_write instruction after the one
3396 * we're processing.
3397 */
3398 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3399 /* Set up the annotation tracking for new generated instructions. */
3400 base_ir = inst->ir;
3401 current_annotation = inst->annotation;
3402
3403 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3404 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3405 }
3406
3407 for (int i = 0 ; i < 3; i++) {
3408 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3409 continue;
3410
3411 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3412
3413 emit_scratch_read(block, inst, temp, inst->src[i],
3414 scratch_loc[inst->src[i].reg]);
3415
3416 inst->src[i].file = temp.file;
3417 inst->src[i].reg = temp.reg;
3418 inst->src[i].reg_offset = temp.reg_offset;
3419 inst->src[i].reladdr = NULL;
3420 }
3421 }
3422 }
3423
3424 /**
3425 * Emits an instruction before @inst to load the value named by @orig_src
3426 * from the pull constant buffer (surface) at @base_offset to @temp.
3427 */
3428 void
3429 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3430 dst_reg temp, src_reg orig_src,
3431 int base_offset)
3432 {
3433 int reg_offset = base_offset + orig_src.reg_offset;
3434 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3435 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3436 reg_offset);
3437 vec4_instruction *load;
3438
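/* On Gen7+ the offset is first moved into a GRF so the load can be sent
 * from the GRF; older generations use an MRF-based message with a
 * one-register payload starting at MRF 14.
 */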
3439 if (brw->gen >= 7) {
3440 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3441 grf_offset.type = offset.type;
3442 emit_before(block, inst, MOV(grf_offset, offset));
3443
3444 load = new(mem_ctx) vec4_instruction(this,
3445 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3446 temp, index, src_reg(grf_offset));
3447 } else {
3448 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3449 temp, index, offset);
3450 load->base_mrf = 14;
3451 load->mlen = 1;
3452 }
3453 emit_before(block, inst, load);
3454 }
3455
3456 /**
3457 * Implements array access of uniforms by inserting a
3458 * PULL_CONSTANT_LOAD instruction.
3459 *
3460 * Unlike temporary GRF array access (where we don't support it due to
3461 * the difficulty of doing relative addressing on instruction
3462 * destinations), we could potentially do array access of uniforms
3463 * that were loaded in GRF space as push constants. In real-world
3464 * usage we've seen, though, the arrays being used are always larger
3465 * than we could load as push constants, so just always move all
3466 * uniform array access out to a pull constant buffer.
3467 */
3468 void
3469 vec4_visitor::move_uniform_array_access_to_pull_constants()
3470 {
3471 int pull_constant_loc[this->uniforms];
3472 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3473 bool nested_reladdr;
3474
3475 /* Walk through and find array access of uniforms. Put a copy of that
3476 * uniform in the pull constant buffer.
3477 *
3478 * Note that we don't move constant-indexed accesses to arrays. No
3479 * testing has been done of the performance impact of this choice.
3480 */
3481 do {
3482 nested_reladdr = false;
3483
3484 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3485 for (int i = 0 ; i < 3; i++) {
3486 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3487 continue;
3488
3489 int uniform = inst->src[i].reg;
3490
3491 if (inst->src[i].reladdr->reladdr)
3492 nested_reladdr = true; /* will need another pass */
3493
3494 /* If this array isn't already present in the pull constant buffer,
3495 * add it.
3496 */
3497 if (pull_constant_loc[uniform] == -1) {
3498 const gl_constant_value **values =
3499 &stage_prog_data->param[uniform * 4];
3500
3501 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3502
3503 assert(uniform < uniform_array_size);
3504 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3505 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3506 = values[j];
3507 }
3508 }
3509
3510 /* Set up the annotation tracking for new generated instructions. */
3511 base_ir = inst->ir;
3512 current_annotation = inst->annotation;
3513
3514 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3515
3516 emit_pull_constant_load(block, inst, temp, inst->src[i],
3517 pull_constant_loc[uniform]);
3518
3519 inst->src[i].file = temp.file;
3520 inst->src[i].reg = temp.reg;
3521 inst->src[i].reg_offset = temp.reg_offset;
3522 inst->src[i].reladdr = NULL;
3523 }
3524 }
3525 } while (nested_reladdr);
3526
3527 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3528 * no need to track them as larger-than-vec4 objects. This will be
3529 * relied on in cutting out unused uniform vectors from push
3530 * constants.
3531 */
3532 split_uniform_registers();
3533 }
3534
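/**
 * Negation on an unsigned-typed source is resolved by applying it with an
 * explicit MOV into a temporary and rewriting the source to use that
 * temporary instead.
 */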
3535 void
3536 vec4_visitor::resolve_ud_negate(src_reg *reg)
3537 {
3538 if (reg->type != BRW_REGISTER_TYPE_UD ||
3539 !reg->negate)
3540 return;
3541
3542 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3543 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3544 *reg = temp;
3545 }
3546
3547 /**
3548 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3549 *
3550 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3551 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3552 */
3553 void
3554 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3555 {
3556 assert(brw->gen <= 5);
3557
3558 if (!rvalue->type->is_boolean())
3559 return;
3560
3561 src_reg and_result = src_reg(this, rvalue->type);
3562 src_reg neg_result = src_reg(this, rvalue->type);
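/* Keep only the defined low bit, then negate it: 0 stays 0 and 1 becomes
 * ~0, the canonical boolean values.
 */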
3563 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3564 emit(MOV(dst_reg(neg_result), negate(and_result)));
3565 *reg = neg_result;
3566 }
3567
3568 vec4_visitor::vec4_visitor(struct brw_context *brw,
3569 struct brw_vec4_compile *c,
3570 struct gl_program *prog,
3571 const struct brw_vec4_prog_key *key,
3572 struct brw_vec4_prog_data *prog_data,
3573 struct gl_shader_program *shader_prog,
3574 gl_shader_stage stage,
3575 void *mem_ctx,
3576 bool debug_flag,
3577 bool no_spills,
3578 shader_time_shader_type st_base,
3579 shader_time_shader_type st_written,
3580 shader_time_shader_type st_reset)
3581 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3582 c(c),
3583 key(key),
3584 prog_data(prog_data),
3585 sanity_param_count(0),
3586 fail_msg(NULL),
3587 first_non_payload_grf(0),
3588 need_all_constants_in_pull_buffer(false),
3589 debug_flag(debug_flag),
3590 no_spills(no_spills),
3591 st_base(st_base),
3592 st_written(st_written),
3593 st_reset(st_reset)
3594 {
3595 this->mem_ctx = mem_ctx;
3596 this->failed = false;
3597
3598 this->base_ir = NULL;
3599 this->current_annotation = NULL;
3600 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3601
3602 this->variable_ht = hash_table_ctor(0,
3603 hash_table_pointer_hash,
3604 hash_table_pointer_compare);
3605
3606 this->virtual_grf_start = NULL;
3607 this->virtual_grf_end = NULL;
3608 this->virtual_grf_sizes = NULL;
3609 this->virtual_grf_count = 0;
3610 this->virtual_grf_reg_map = NULL;
3611 this->virtual_grf_reg_count = 0;
3612 this->virtual_grf_array_size = 0;
3613 this->live_intervals = NULL;
3614
3615 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3616
3617 this->uniforms = 0;
3618
3619 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3620 * at least one. See setup_uniforms() in brw_vec4.cpp.
3621 */
3622 this->uniform_array_size = 1;
3623 if (prog_data) {
3624 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3625 }
3626
3627 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3628 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3629 }
3630
3631 vec4_visitor::~vec4_visitor()
3632 {
3633 hash_table_dtor(this->variable_ht);
3634 }
3635
3636
3637 void
3638 vec4_visitor::fail(const char *format, ...)
3639 {
3640 va_list va;
3641 char *msg;
3642
3643 if (failed)
3644 return;
3645
3646 failed = true;
3647
3648 va_start(va, format);
3649 msg = ralloc_vasprintf(mem_ctx, format, va);
3650 va_end(va);
3651 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3652
3653 this->fail_msg = msg;
3654
3655 if (debug_flag) {
3656 fprintf(stderr, "%s", msg);
3657 }
3658 }
3659
3660 } /* namespace brw */