i965/vec4: Optimize packSnorm4x8().
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 extern "C" {
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, const dst_reg &dst,
35 const src_reg &src0, const src_reg &src1,
36 const src_reg &src2)
37 {
38 this->opcode = opcode;
39 this->dst = dst;
40 this->src[0] = src0;
41 this->src[1] = src1;
42 this->src[2] = src2;
43 this->saturate = false;
44 this->force_writemask_all = false;
45 this->no_dd_clear = false;
46 this->no_dd_check = false;
47 this->writes_accumulator = false;
48 this->conditional_mod = BRW_CONDITIONAL_NONE;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
70 vec4_instruction *new_inst)
71 {
72 new_inst->ir = inst->ir;
73 new_inst->annotation = inst->annotation;
74
75 inst->insert_before(block, new_inst);
76
77 return inst;
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
82 const src_reg &src1, const src_reg &src2)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
85 src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
91 const src_reg &src1)
92 {
93 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
98 {
99 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
104 {
105 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
112 }
113
114 #define ALU1(op) \
115 vec4_instruction * \
116 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
117 { \
118 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
119 src0); \
120 }
121
122 #define ALU2(op) \
123 vec4_instruction * \
124 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
125 const src_reg &src1) \
126 { \
127 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
128 src0, src1); \
129 }
130
131 #define ALU2_ACC(op) \
132 vec4_instruction * \
133 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
134 const src_reg &src1) \
135 { \
136 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
137 BRW_OPCODE_##op, dst, src0, src1); \
138 inst->writes_accumulator = true; \
139 return inst; \
140 }
141
142 #define ALU3(op) \
143 vec4_instruction * \
144 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
145 const src_reg &src1, const src_reg &src2) \
146 { \
147 assert(brw->gen >= 6); \
148 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
149 src0, src1, src2); \
150 }
151
152 ALU1(NOT)
153 ALU1(MOV)
154 ALU1(FRC)
155 ALU1(RNDD)
156 ALU1(RNDE)
157 ALU1(RNDZ)
158 ALU1(F32TO16)
159 ALU1(F16TO32)
160 ALU2(ADD)
161 ALU2(MUL)
162 ALU2_ACC(MACH)
163 ALU2(AND)
164 ALU2(OR)
165 ALU2(XOR)
166 ALU2(DP3)
167 ALU2(DP4)
168 ALU2(DPH)
169 ALU2(SHL)
170 ALU2(SHR)
171 ALU2(ASR)
172 ALU3(LRP)
173 ALU1(BFREV)
174 ALU3(BFE)
175 ALU2(BFI1)
176 ALU3(BFI2)
177 ALU1(FBH)
178 ALU1(FBL)
179 ALU1(CBIT)
180 ALU3(MAD)
181 ALU2_ACC(ADDC)
182 ALU2_ACC(SUBB)
183 ALU2(MAC)
184
185 /** Gen4 predicated IF. */
186 vec4_instruction *
187 vec4_visitor::IF(enum brw_predicate predicate)
188 {
189 vec4_instruction *inst;
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
192 inst->predicate = predicate;
193
194 return inst;
195 }
196
197 /** Gen6 IF with embedded comparison. */
198 vec4_instruction *
199 vec4_visitor::IF(src_reg src0, src_reg src1,
200 enum brw_conditional_mod condition)
201 {
202 assert(brw->gen == 6);
203
204 vec4_instruction *inst;
205
206 resolve_ud_negate(&src0);
207 resolve_ud_negate(&src1);
208
209 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
210 src0, src1);
211 inst->conditional_mod = condition;
212
213 return inst;
214 }
215
216 /**
217 * CMP: Sets the low bit of the destination channels with the result
218 * of the comparison, while the upper bits are undefined, and updates
219 * the flag register with the packed 16 bits of the result.
220 */
221 vec4_instruction *
222 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
223 enum brw_conditional_mod condition)
224 {
225 vec4_instruction *inst;
226
227 /* original gen4 does type conversion to the destination type
228 * before comparison, producing garbage results for floating
229 * point comparisons.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 vec4_instruction *
247 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
248 {
249 vec4_instruction *inst;
250
251 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
252 dst, index);
253 inst->base_mrf = 14;
254 inst->mlen = 2;
255
256 return inst;
257 }
258
259 vec4_instruction *
260 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
261 const src_reg &index)
262 {
263 vec4_instruction *inst;
264
265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
266 dst, src, index);
267 inst->base_mrf = 13;
268 inst->mlen = 3;
269
270 return inst;
271 }
272
273 void
274 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
275 {
276 static enum opcode dot_opcodes[] = {
277 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
278 };
279
280 emit(dot_opcodes[elements - 2], dst, src0, src1);
281 }
282
283 src_reg
284 vec4_visitor::fix_3src_operand(src_reg src)
285 {
286 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
287 * able to use vertical stride of zero to replicate the vec4 uniform, like
288 *
289 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
290 *
291 * But you can't, since vertical stride is always four in three-source
292 * instructions. Instead, insert a MOV instruction to do the replication so
293 * that the three-source instruction can consume it.
294 */
295
296 /* The MOV is only needed if the source is a uniform or immediate. */
297 if (src.file != UNIFORM && src.file != IMM)
298 return src;
299
300 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
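/* Illustrative sketch (not emitted verbatim): with a vec4 uniform source,
 *
 *    mad dst, u0.xyzw, r1, r2
 *
 * becomes
 *
 *    mov tmp, u0.xyzw
 *    mad dst, tmp, r1, r2
 *
 * whereas a single-value swizzle such as u0.xxxx is consumed directly.
 */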
308
309 src_reg
310 vec4_visitor::fix_math_operand(src_reg src)
311 {
312 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
313 return src;
314
315 /* The gen6 math instruction ignores the source modifiers --
316 * swizzle, abs, negate, and at least some parts of the register
317 * region description.
318 *
319 * Rather than trying to enumerate all these cases, *always* expand the
320 * operand to a temp GRF for gen6.
321 *
322 * For gen7, keep the operand as-is, except if immediate, which gen7 still
323 * can't use.
324 */
325
326 if (brw->gen == 7 && src.file != IMM)
327 return src;
328
329 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
330 expanded.type = src.type;
331 emit(MOV(expanded, src));
332 return src_reg(expanded);
333 }
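/* Roughly: on gen6 every math operand is copied to a temporary GRF; on gen7
 * only immediates are copied (e.g. the 2.0f in pow(x, 2.0) gets a MOV to a
 * GRF first); on gen8+ and pre-gen6 the operand is passed through unchanged.
 */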
334
335 void
336 vec4_visitor::emit_math(enum opcode opcode,
337 const dst_reg &dst,
338 const src_reg &src0, const src_reg &src1)
339 {
340 vec4_instruction *math =
341 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
342
343 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
344 /* MATH on Gen6 must be align1, so we can't do writemasks. */
345 math->dst = dst_reg(this, glsl_type::vec4_type);
346 math->dst.type = dst.type;
347 emit(MOV(dst, src_reg(math->dst)));
348 } else if (brw->gen < 6) {
349 math->base_mrf = 1;
350 math->mlen = src1.file == BAD_FILE ? 1 : 2;
351 }
352 }
353
354 void
355 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
356 {
357 if (brw->gen < 7) {
358 unreachable("ir_unop_pack_half_2x16 should be lowered");
359 }
360
361 assert(dst.type == BRW_REGISTER_TYPE_UD);
362 assert(src0.type == BRW_REGISTER_TYPE_F);
363
364 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
365 *
366 * Because this instruction does not have a 16-bit floating-point type,
367 * the destination data type must be Word (W).
368 *
369 * The destination must be DWord-aligned and specify a horizontal stride
370 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
371 * each destination channel and the upper word is not modified.
372 *
373 * The above restriction implies that the f32to16 instruction must use
374 * align1 mode, because only in align1 mode is it possible to specify
375 * horizontal stride. We choose here to defy the hardware docs and emit
376 * align16 instructions.
377 *
378 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
379 * instructions. I was partially successful in that the code passed all
380 * tests. However, the code was dubiously correct and fragile, and the
381 * tests were not harsh enough to probe that frailty. Not trusting the
382 * code, I chose instead to remain in align16 mode in defiance of the hw
383 * docs).
384 *
385 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
386 * simulator, emitting a f32to16 in align16 mode with UD as destination
387 * data type is safe. The behavior differs from that specified in the PRM
388 * in that the upper word of each destination channel is cleared to 0.
389 */
390
391 dst_reg tmp_dst(this, glsl_type::uvec2_type);
392 src_reg tmp_src(tmp_dst);
393
394 #if 0
395 /* Verify the undocumented behavior on which the following instructions
396 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
397 * then the result of the bit-or instruction below will be incorrect.
398 *
399 * You should inspect the disasm output in order to verify that the MOV is
400 * not optimized away.
401 */
402 emit(MOV(tmp_dst, src_reg(0x12345678u)));
403 #endif
404
405 /* Give tmp the form below, where "." means untouched.
406 *
407 * w z y x w z y x
408 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
409 *
410 * That the upper word of each write-channel be 0 is required for the
411 * following bit-shift and bit-or instructions to work. Note that this
412 * relies on the undocumented hardware behavior mentioned above.
413 */
414 tmp_dst.writemask = WRITEMASK_XY;
415 emit(F32TO16(tmp_dst, src0));
416
417 /* Give the write-channels of dst the form:
418 * 0xhhhh0000
419 */
420 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
421 emit(SHL(dst, tmp_src, src_reg(16u)));
422
423 /* Finally, give the write-channels of dst the form of packHalf2x16's
424 * output:
425 * 0xhhhhllll
426 */
427 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
428 emit(OR(dst, src_reg(dst), tmp_src));
429 }
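/* Worked example (assuming the standard IEEE half-float encodings):
 * packHalf2x16(vec2(1.0, -2.0)) produces low word 0x3C00 (1.0) and high word
 * 0xC000 (-2.0), so the SHL/OR sequence above yields 0xC0003C00.
 */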
430
431 void
432 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
433 {
434 if (brw->gen < 7) {
435 unreachable("ir_unop_unpack_half_2x16 should be lowered");
436 }
437
438 assert(dst.type == BRW_REGISTER_TYPE_F);
439 assert(src0.type == BRW_REGISTER_TYPE_UD);
440
441 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
442 *
443 * Because this instruction does not have a 16-bit floating-point type,
444 * the source data type must be Word (W). The destination type must be
445 * F (Float).
446 *
447 * To use W as the source data type, we must adjust horizontal strides,
448 * which is only possible in align1 mode. All my [chadv] attempts at
449 * emitting align1 instructions for unpackHalf2x16 failed to pass the
450 * Piglit tests, so I gave up.
451 *
452 * I've verified that, on gen7 hardware and the simulator, it is safe to
453 * emit f16to32 in align16 mode with UD as source data type.
454 */
455
456 dst_reg tmp_dst(this, glsl_type::uvec2_type);
457 src_reg tmp_src(tmp_dst);
458
459 tmp_dst.writemask = WRITEMASK_X;
460 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
461
462 tmp_dst.writemask = WRITEMASK_Y;
463 emit(SHR(tmp_dst, src0, src_reg(16u)));
464
465 dst.writemask = WRITEMASK_XY;
466 emit(F16TO32(dst, tmp_src));
467 }
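/* Worked example: unpackHalf2x16(0xC0003C00u) -> the AND/SHR split the dword
 * into 0x3C00 (x) and 0xC000 (y), and F16TO32 yields vec2(1.0, -2.0).
 */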
468
469 void
470 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
471 {
472 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
473 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
474 * is not suitable to generate the shift values, but we can use the packed
475 * vector float and a type-converting MOV.
476 */
477 dst_reg shift(this, glsl_type::uvec4_type);
478 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
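/* The immediate above is a packed vector-float (VF): each byte is a
 * restricted 8-bit float (1 sign, 3 exponent bits biased by 3, 4 mantissa
 * bits), so 0x00, 0x60, 0x70, 0x78 decode to 0.0, 8.0, 16.0 and 24.0. The
 * type-converting MOV turns those into the UD shift counts <0, 8, 16, 24>.
 */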
479
480 dst_reg shifted(this, glsl_type::uvec4_type);
481 src0.swizzle = BRW_SWIZZLE_XXXX;
482 emit(SHR(shifted, src0, src_reg(shift)));
483
484 shifted.type = BRW_REGISTER_TYPE_UB;
485 dst_reg f(this, glsl_type::vec4_type);
486 emit(MOV(f, src_reg(shifted)));
487
488 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
489 }
490
491 void
492 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
493 {
494 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
495 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
496 * is not suitable to generate the shift values, but we can use the packed
497 * vector float and a type-converting MOV.
498 */
499 dst_reg shift(this, glsl_type::uvec4_type);
500 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
501
502 dst_reg shifted(this, glsl_type::uvec4_type);
503 src0.swizzle = BRW_SWIZZLE_XXXX;
504 emit(SHR(shifted, src0, src_reg(shift)));
505
506 shifted.type = BRW_REGISTER_TYPE_B;
507 dst_reg f(this, glsl_type::vec4_type);
508 emit(MOV(f, src_reg(shifted)));
509
510 dst_reg scaled(this, glsl_type::vec4_type);
511 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
512
513 dst_reg max(this, glsl_type::vec4_type);
514 emit_minmax(BRW_CONDITIONAL_G, max, src_reg(scaled), src_reg(-1.0f));
515 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
516 }
517
518 void
519 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
520 {
521 dst_reg saturated(this, glsl_type::vec4_type);
522 vec4_instruction *inst = emit(MOV(saturated, src0));
523 inst->saturate = true;
524
525 dst_reg scaled(this, glsl_type::vec4_type);
526 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
527
528 dst_reg rounded(this, glsl_type::vec4_type);
529 emit(RNDE(rounded, src_reg(scaled)));
530
531 dst_reg u(this, glsl_type::uvec4_type);
532 emit(MOV(u, src_reg(rounded)));
533
534 src_reg bytes(u);
535 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
536 }
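/* Worked example: packUnorm4x8(vec4(0.0, 0.5, 1.0, 2.0)) saturates to
 * (0, 0.5, 1, 1), scales to (0, 127.5, 255, 255), RNDE rounds 127.5 to the
 * even value 128, and PACK_BYTES yields 0xFFFF8000 (x in the low byte).
 */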
537
538 void
539 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
540 {
541 dst_reg max(this, glsl_type::vec4_type);
542 emit_minmax(BRW_CONDITIONAL_G, max, src0, src_reg(-1.0f));
543
544 dst_reg min(this, glsl_type::vec4_type);
545 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
546
547 dst_reg scaled(this, glsl_type::vec4_type);
548 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
549
550 dst_reg rounded(this, glsl_type::vec4_type);
551 emit(RNDE(rounded, src_reg(scaled)));
552
553 dst_reg i(this, glsl_type::ivec4_type);
554 emit(MOV(i, src_reg(rounded)));
555
556 src_reg bytes(i);
557 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
558 }
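/* Worked example: packSnorm4x8(vec4(1.0, -1.0, 0.5, -0.25)) clamps to
 * [-1, 1], scales to (127, -127, 63.5, -31.75), RNDE gives (127, -127, 64,
 * -32), and the low bytes pack to 0xE040817F (x in the low byte).
 */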
559
560 void
561 vec4_visitor::visit_instructions(const exec_list *list)
562 {
563 foreach_in_list(ir_instruction, ir, list) {
564 base_ir = ir;
565 ir->accept(this);
566 }
567 }
568
569
570 static int
571 type_size(const struct glsl_type *type)
572 {
573 unsigned int i;
574 int size;
575
576 switch (type->base_type) {
577 case GLSL_TYPE_UINT:
578 case GLSL_TYPE_INT:
579 case GLSL_TYPE_FLOAT:
580 case GLSL_TYPE_BOOL:
581 if (type->is_matrix()) {
582 return type->matrix_columns;
583 } else {
584 /* Regardless of size of vector, it gets a vec4. This is bad
585 * packing for things like floats, but otherwise arrays become a
586 * mess. Hopefully a later pass over the code can pack scalars
587 * down if appropriate.
588 */
589 return 1;
590 }
591 case GLSL_TYPE_ARRAY:
592 assert(type->length > 0);
593 return type_size(type->fields.array) * type->length;
594 case GLSL_TYPE_STRUCT:
595 size = 0;
596 for (i = 0; i < type->length; i++) {
597 size += type_size(type->fields.structure[i].type);
598 }
599 return size;
600 case GLSL_TYPE_SAMPLER:
601 /* Samplers take up no register space, since they're baked in at
602 * link time.
603 */
604 return 0;
605 case GLSL_TYPE_ATOMIC_UINT:
606 return 0;
607 case GLSL_TYPE_IMAGE:
608 case GLSL_TYPE_VOID:
609 case GLSL_TYPE_ERROR:
610 case GLSL_TYPE_INTERFACE:
611 unreachable("not reached");
612 }
613
614 return 0;
615 }
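/* For example, type_size() returns 1 for float or vec4, 3 for mat3 (one vec4
 * slot per column), 4 for float[4], 2 for struct { vec3; float; }, and 0 for
 * a sampler.
 */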
616
617 int
618 vec4_visitor::virtual_grf_alloc(int size)
619 {
620 if (virtual_grf_array_size <= virtual_grf_count) {
621 if (virtual_grf_array_size == 0)
622 virtual_grf_array_size = 16;
623 else
624 virtual_grf_array_size *= 2;
625 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
626 virtual_grf_array_size);
627 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
628 virtual_grf_array_size);
629 }
630 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
631 virtual_grf_reg_count += size;
632 virtual_grf_sizes[virtual_grf_count] = size;
633 return virtual_grf_count++;
634 }
635
636 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
637 {
638 init();
639
640 this->file = GRF;
641 this->reg = v->virtual_grf_alloc(type_size(type));
642
643 if (type->is_array() || type->is_record()) {
644 this->swizzle = BRW_SWIZZLE_NOOP;
645 } else {
646 this->swizzle = swizzle_for_size(type->vector_elements);
647 }
648
649 this->type = brw_type_for_base_type(type);
650 }
651
652 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
653 {
654 assert(size > 0);
655
656 init();
657
658 this->file = GRF;
659 this->reg = v->virtual_grf_alloc(type_size(type) * size);
660
661 this->swizzle = BRW_SWIZZLE_NOOP;
662
663 this->type = brw_type_for_base_type(type);
664 }
665
666 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
667 {
668 init();
669
670 this->file = GRF;
671 this->reg = v->virtual_grf_alloc(type_size(type));
672
673 if (type->is_array() || type->is_record()) {
674 this->writemask = WRITEMASK_XYZW;
675 } else {
676 this->writemask = (1 << type->vector_elements) - 1;
677 }
678
679 this->type = brw_type_for_base_type(type);
680 }
681
682 /* Our support for uniforms is piggy-backed on the struct
683 * gl_fragment_program, because that's where the values actually
684 * get stored, rather than in some global gl_shader_program uniform
685 * store.
686 */
687 void
688 vec4_visitor::setup_uniform_values(ir_variable *ir)
689 {
690 int namelen = strlen(ir->name);
691
692 /* The data for our (non-builtin) uniforms is stored in a series of
693 * gl_uniform_driver_storage structs for each subcomponent that
694 * glGetUniformLocation() could name. We know it's been set up in the same
695 * order we'd walk the type, so walk the list of storage and find anything
696 * with our name, or the prefix of a component that starts with our name.
697 */
698 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
699 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
700
701 if (strncmp(ir->name, storage->name, namelen) != 0 ||
702 (storage->name[namelen] != 0 &&
703 storage->name[namelen] != '.' &&
704 storage->name[namelen] != '[')) {
705 continue;
706 }
707
708 gl_constant_value *components = storage->storage;
709 unsigned vector_count = (MAX2(storage->array_elements, 1) *
710 storage->type->matrix_columns);
711
712 for (unsigned s = 0; s < vector_count; s++) {
713 assert(uniforms < uniform_array_size);
714 uniform_vector_size[uniforms] = storage->type->vector_elements;
715
716 int i;
717 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
718 stage_prog_data->param[uniforms * 4 + i] = components;
719 components++;
720 }
721 for (; i < 4; i++) {
722 static gl_constant_value zero = { 0.0 };
723 stage_prog_data->param[uniforms * 4 + i] = &zero;
724 }
725
726 uniforms++;
727 }
728 }
729 }
730
731 void
732 vec4_visitor::setup_uniform_clipplane_values()
733 {
734 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
735
736 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
737 assert(this->uniforms < uniform_array_size);
738 this->uniform_vector_size[this->uniforms] = 4;
739 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
740 this->userplane[i].type = BRW_REGISTER_TYPE_F;
741 for (int j = 0; j < 4; ++j) {
742 stage_prog_data->param[this->uniforms * 4 + j] =
743 (gl_constant_value *) &clip_planes[i][j];
744 }
745 ++this->uniforms;
746 }
747 }
748
749 /* Our support for builtin uniforms is even scarier than non-builtin.
750 * It sits on top of the PROG_STATE_VAR parameters that are
751 * automatically updated from GL context state.
752 */
753 void
754 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
755 {
756 const ir_state_slot *const slots = ir->get_state_slots();
757 assert(slots != NULL);
758
759 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
760 /* This state reference has already been setup by ir_to_mesa,
761 * but we'll get the same index back here. We can reference
762 * ParameterValues directly, since unlike brw_fs.cpp, we never
763 * add new state references during compile.
764 */
765 int index = _mesa_add_state_reference(this->prog->Parameters,
766 (gl_state_index *)slots[i].tokens);
767 gl_constant_value *values =
768 &this->prog->Parameters->ParameterValues[index][0];
769
770 assert(this->uniforms < uniform_array_size);
771 this->uniform_vector_size[this->uniforms] = 0;
772 /* Add each of the unique swizzled channels of the element.
773 * This will end up matching the size of the glsl_type of this field.
774 */
775 int last_swiz = -1;
776 for (unsigned int j = 0; j < 4; j++) {
777 int swiz = GET_SWZ(slots[i].swizzle, j);
778 last_swiz = swiz;
779
780 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
781 assert(this->uniforms < uniform_array_size);
782 if (swiz <= last_swiz)
783 this->uniform_vector_size[this->uniforms]++;
784 }
785 this->uniforms++;
786 }
787 }
788
789 dst_reg *
790 vec4_visitor::variable_storage(ir_variable *var)
791 {
792 return (dst_reg *)hash_table_find(this->variable_ht, var);
793 }
794
795 void
796 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
797 enum brw_predicate *predicate)
798 {
799 ir_expression *expr = ir->as_expression();
800
801 *predicate = BRW_PREDICATE_NORMAL;
802
803 if (expr && expr->operation != ir_binop_ubo_load) {
804 src_reg op[3];
805 vec4_instruction *inst;
806
807 assert(expr->get_num_operands() <= 3);
808 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
809 expr->operands[i]->accept(this);
810 op[i] = this->result;
811
812 resolve_ud_negate(&op[i]);
813 }
814
815 switch (expr->operation) {
816 case ir_unop_logic_not:
817 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
818 inst->conditional_mod = BRW_CONDITIONAL_Z;
819 break;
820
821 case ir_binop_logic_xor:
822 inst = emit(XOR(dst_null_d(), op[0], op[1]));
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 break;
825
826 case ir_binop_logic_or:
827 inst = emit(OR(dst_null_d(), op[0], op[1]));
828 inst->conditional_mod = BRW_CONDITIONAL_NZ;
829 break;
830
831 case ir_binop_logic_and:
832 inst = emit(AND(dst_null_d(), op[0], op[1]));
833 inst->conditional_mod = BRW_CONDITIONAL_NZ;
834 break;
835
836 case ir_unop_f2b:
837 if (brw->gen >= 6) {
838 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
839 } else {
840 inst = emit(MOV(dst_null_f(), op[0]));
841 inst->conditional_mod = BRW_CONDITIONAL_NZ;
842 }
843 break;
844
845 case ir_unop_i2b:
846 if (brw->gen >= 6) {
847 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
848 } else {
849 inst = emit(MOV(dst_null_d(), op[0]));
850 inst->conditional_mod = BRW_CONDITIONAL_NZ;
851 }
852 break;
853
854 case ir_binop_all_equal:
855 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
856 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
857 break;
858
859 case ir_binop_any_nequal:
860 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
861 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
862 break;
863
864 case ir_unop_any:
865 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
866 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
867 break;
868
869 case ir_binop_greater:
870 case ir_binop_gequal:
871 case ir_binop_less:
872 case ir_binop_lequal:
873 case ir_binop_equal:
874 case ir_binop_nequal:
875 emit(CMP(dst_null_d(), op[0], op[1],
876 brw_conditional_for_comparison(expr->operation)));
877 break;
878
879 case ir_triop_csel: {
880 /* Expand the boolean condition into the flag register. */
881 inst = emit(MOV(dst_null_d(), op[0]));
882 inst->conditional_mod = BRW_CONDITIONAL_NZ;
883
884 /* Select which boolean to return. */
885 dst_reg temp(this, expr->operands[1]->type);
886 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
887 inst->predicate = BRW_PREDICATE_NORMAL;
888
889 /* Expand the result to a condition code. */
890 inst = emit(MOV(dst_null_d(), src_reg(temp)));
891 inst->conditional_mod = BRW_CONDITIONAL_NZ;
892 break;
893 }
894
895 default:
896 unreachable("not reached");
897 }
898 return;
899 }
900
901 ir->accept(this);
902
903 resolve_ud_negate(&this->result);
904
905 if (brw->gen >= 6) {
906 vec4_instruction *inst = emit(AND(dst_null_d(),
907 this->result, src_reg(1)));
908 inst->conditional_mod = BRW_CONDITIONAL_NZ;
909 } else {
910 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
911 inst->conditional_mod = BRW_CONDITIONAL_NZ;
912 }
913 }
914
915 /**
916 * Emit a gen6 IF statement with the comparison folded into the IF
917 * instruction.
918 */
919 void
920 vec4_visitor::emit_if_gen6(ir_if *ir)
921 {
922 ir_expression *expr = ir->condition->as_expression();
923
924 if (expr && expr->operation != ir_binop_ubo_load) {
925 src_reg op[3];
926 dst_reg temp;
927
928 assert(expr->get_num_operands() <= 3);
929 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
930 expr->operands[i]->accept(this);
931 op[i] = this->result;
932 }
933
934 switch (expr->operation) {
935 case ir_unop_logic_not:
936 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
937 return;
938
939 case ir_binop_logic_xor:
940 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
941 return;
942
943 case ir_binop_logic_or:
944 temp = dst_reg(this, glsl_type::bool_type);
945 emit(OR(temp, op[0], op[1]));
946 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
947 return;
948
949 case ir_binop_logic_and:
950 temp = dst_reg(this, glsl_type::bool_type);
951 emit(AND(temp, op[0], op[1]));
952 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
953 return;
954
955 case ir_unop_f2b:
956 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
957 return;
958
959 case ir_unop_i2b:
960 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
961 return;
962
963 case ir_binop_greater:
964 case ir_binop_gequal:
965 case ir_binop_less:
966 case ir_binop_lequal:
967 case ir_binop_equal:
968 case ir_binop_nequal:
969 emit(IF(op[0], op[1],
970 brw_conditional_for_comparison(expr->operation)));
971 return;
972
973 case ir_binop_all_equal:
974 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
975 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
976 return;
977
978 case ir_binop_any_nequal:
979 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
980 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
981 return;
982
983 case ir_unop_any:
984 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
985 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
986 return;
987
988 case ir_triop_csel: {
989 /* Expand the boolean condition into the flag register. */
990 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
991 inst->conditional_mod = BRW_CONDITIONAL_NZ;
992
993 /* Select which boolean to return. */
994 dst_reg temp(this, expr->operands[1]->type);
995 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
996 inst->predicate = BRW_PREDICATE_NORMAL;
997
998 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
999 return;
1000 }
1001
1002 default:
1003 unreachable("not reached");
1004 }
1005 return;
1006 }
1007
1008 ir->condition->accept(this);
1009
1010 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1011 }
1012
1013 void
1014 vec4_visitor::visit(ir_variable *ir)
1015 {
1016 dst_reg *reg = NULL;
1017
1018 if (variable_storage(ir))
1019 return;
1020
1021 switch (ir->data.mode) {
1022 case ir_var_shader_in:
1023 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1024 break;
1025
1026 case ir_var_shader_out:
1027 reg = new(mem_ctx) dst_reg(this, ir->type);
1028
1029 for (int i = 0; i < type_size(ir->type); i++) {
1030 output_reg[ir->data.location + i] = *reg;
1031 output_reg[ir->data.location + i].reg_offset = i;
1032 output_reg[ir->data.location + i].type =
1033 brw_type_for_base_type(ir->type->get_scalar_type());
1034 output_reg_annotation[ir->data.location + i] = ir->name;
1035 }
1036 break;
1037
1038 case ir_var_auto:
1039 case ir_var_temporary:
1040 reg = new(mem_ctx) dst_reg(this, ir->type);
1041 break;
1042
1043 case ir_var_uniform:
1044 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1045
1046 /* Thanks to the lower_ubo_reference pass, we will see only
1047 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1048 * variables, so no need for them to be in variable_ht.
1049 *
1050 * Some uniforms, such as samplers and atomic counters, have no actual
1051 * storage, so we should ignore them.
1052 */
1053 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1054 return;
1055
1056 /* Track how big the whole uniform variable is, in case we need to put a
1057 * copy of its data into pull constants for array access.
1058 */
1059 assert(this->uniforms < uniform_array_size);
1060 this->uniform_size[this->uniforms] = type_size(ir->type);
1061
1062 if (!strncmp(ir->name, "gl_", 3)) {
1063 setup_builtin_uniform_values(ir);
1064 } else {
1065 setup_uniform_values(ir);
1066 }
1067 break;
1068
1069 case ir_var_system_value:
1070 reg = make_reg_for_system_value(ir);
1071 break;
1072
1073 default:
1074 unreachable("not reached");
1075 }
1076
1077 reg->type = brw_type_for_base_type(ir->type);
1078 hash_table_insert(this->variable_ht, reg, ir);
1079 }
1080
1081 void
1082 vec4_visitor::visit(ir_loop *ir)
1083 {
1084 /* We don't want debugging output to print the whole body of the
1085 * loop as the annotation.
1086 */
1087 this->base_ir = NULL;
1088
1089 emit(BRW_OPCODE_DO);
1090
1091 visit_instructions(&ir->body_instructions);
1092
1093 emit(BRW_OPCODE_WHILE);
1094 }
1095
1096 void
1097 vec4_visitor::visit(ir_loop_jump *ir)
1098 {
1099 switch (ir->mode) {
1100 case ir_loop_jump::jump_break:
1101 emit(BRW_OPCODE_BREAK);
1102 break;
1103 case ir_loop_jump::jump_continue:
1104 emit(BRW_OPCODE_CONTINUE);
1105 break;
1106 }
1107 }
1108
1109
1110 void
1111 vec4_visitor::visit(ir_function_signature *)
1112 {
1113 unreachable("not reached");
1114 }
1115
1116 void
1117 vec4_visitor::visit(ir_function *ir)
1118 {
1119 /* Ignore function bodies other than main() -- we shouldn't see calls to
1120 * them since they should all be inlined.
1121 */
1122 if (strcmp(ir->name, "main") == 0) {
1123 const ir_function_signature *sig;
1124 exec_list empty;
1125
1126 sig = ir->matching_signature(NULL, &empty, false);
1127
1128 assert(sig);
1129
1130 visit_instructions(&sig->body);
1131 }
1132 }
1133
1134 bool
1135 vec4_visitor::try_emit_mad(ir_expression *ir)
1136 {
1137 /* 3-src instructions were introduced in gen6. */
1138 if (brw->gen < 6)
1139 return false;
1140
1141 /* MAD can only handle floating-point data. */
1142 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1143 return false;
1144
1145 ir_rvalue *nonmul = ir->operands[1];
1146 ir_expression *mul = ir->operands[0]->as_expression();
1147
1148 if (!mul || mul->operation != ir_binop_mul) {
1149 nonmul = ir->operands[0];
1150 mul = ir->operands[1]->as_expression();
1151
1152 if (!mul || mul->operation != ir_binop_mul)
1153 return false;
1154 }
1155
1156 nonmul->accept(this);
1157 src_reg src0 = fix_3src_operand(this->result);
1158
1159 mul->operands[0]->accept(this);
1160 src_reg src1 = fix_3src_operand(this->result);
1161
1162 mul->operands[1]->accept(this);
1163 src_reg src2 = fix_3src_operand(this->result);
1164
1165 this->result = src_reg(this, ir->type);
1166 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1167
1168 return true;
1169 }
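/* Sketch of the transform: for an expression like a + b * c, the add's
 * non-multiply operand becomes src0 and the multiply's operands become
 * src1/src2, so we emit MAD dst, a, b, c (the hardware MAD computes
 * src1 * src2 + src0).
 */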
1170
1171 bool
1172 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1173 {
1174 /* This optimization relies on CMP setting the destination to 0 when
1175 * false. Early hardware only sets the least significant bit, and
1176 * leaves the other bits undefined. So we can't use it.
1177 */
1178 if (brw->gen < 6)
1179 return false;
1180
1181 ir_expression *const cmp = ir->operands[0]->as_expression();
1182
1183 if (cmp == NULL)
1184 return false;
1185
1186 switch (cmp->operation) {
1187 case ir_binop_less:
1188 case ir_binop_greater:
1189 case ir_binop_lequal:
1190 case ir_binop_gequal:
1191 case ir_binop_equal:
1192 case ir_binop_nequal:
1193 break;
1194
1195 default:
1196 return false;
1197 }
1198
1199 cmp->operands[0]->accept(this);
1200 const src_reg cmp_src0 = this->result;
1201
1202 cmp->operands[1]->accept(this);
1203 const src_reg cmp_src1 = this->result;
1204
1205 this->result = src_reg(this, ir->type);
1206
1207 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1208 brw_conditional_for_comparison(cmp->operation)));
1209
1210 /* If the comparison is false, this->result will just happen to be zero.
1211 */
1212 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1213 this->result, src_reg(1.0f));
1214 inst->predicate = BRW_PREDICATE_NORMAL;
1215 inst->predicate_inverse = true;
1216
1217 return true;
1218 }
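/* Sketch: for float(a < b) this emits roughly
 *
 *    cmp.l.f0 result, a, b
 *    (-f0) sel result, result, 1.0f
 *
 * so channels where the comparison failed keep the 0.0 written by CMP and
 * channels where it passed get 1.0f from the inverted-predicate SEL.
 */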
1219
1220 void
1221 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1222 src_reg src0, src_reg src1)
1223 {
1224 vec4_instruction *inst;
1225
1226 if (brw->gen >= 6) {
1227 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1228 inst->conditional_mod = conditionalmod;
1229 } else {
1230 emit(CMP(dst, src0, src1, conditionalmod));
1231
1232 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1233 inst->predicate = BRW_PREDICATE_NORMAL;
1234 }
1235 }
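/* E.g. min(a, b) becomes a conditional-mod "sel.l dst, a, b" on gen6+, while
 * gen4/5 emit CMP.L followed by a predicated SEL that picks a where the
 * comparison passed.
 */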
1236
1237 void
1238 vec4_visitor::emit_lrp(const dst_reg &dst,
1239 const src_reg &x, const src_reg &y, const src_reg &a)
1240 {
1241 if (brw->gen >= 6) {
1242 /* Note that the instruction's argument order is reversed from GLSL
1243 * and the IR.
1244 */
1245 emit(LRP(dst,
1246 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1247 } else {
1248 /* Earlier generations don't support three source operations, so we
1249 * need to emit x*(1-a) + y*a.
1250 */
1251 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1252 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1253 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1254 y_times_a.writemask = dst.writemask;
1255 one_minus_a.writemask = dst.writemask;
1256 x_times_one_minus_a.writemask = dst.writemask;
1257
1258 emit(MUL(y_times_a, y, a));
1259 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1260 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1261 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1262 }
1263 }
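/* On the pre-gen6 path, e.g. mix(2.0, 6.0, 0.25) expands to
 * 6.0 * 0.25 + 2.0 * (1 - 0.25) = 1.5 + 1.5 = 3.0, matching what the single
 * LRP produces on gen6+ (note LRP takes its arguments in the order a, y, x).
 */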
1264
1265 void
1266 vec4_visitor::visit(ir_expression *ir)
1267 {
1268 unsigned int operand;
1269 src_reg op[Elements(ir->operands)];
1270 vec4_instruction *inst;
1271
1272 if (ir->operation == ir_binop_add) {
1273 if (try_emit_mad(ir))
1274 return;
1275 }
1276
1277 if (ir->operation == ir_unop_b2f) {
1278 if (try_emit_b2f_of_compare(ir))
1279 return;
1280 }
1281
1282 /* Storage for our result. Ideally for an assignment we'd be using
1283 * the actual storage for the result here, instead.
1284 */
1285 dst_reg result_dst(this, ir->type);
1286 src_reg result_src(result_dst);
1287
1288 if (ir->operation == ir_triop_csel) {
1289 ir->operands[1]->accept(this);
1290 op[1] = this->result;
1291 ir->operands[2]->accept(this);
1292 op[2] = this->result;
1293
1294 enum brw_predicate predicate;
1295 emit_bool_to_cond_code(ir->operands[0], &predicate);
1296 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1297 inst->predicate = predicate;
1298 this->result = result_src;
1299 return;
1300 }
1301
1302 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1303 this->result.file = BAD_FILE;
1304 ir->operands[operand]->accept(this);
1305 if (this->result.file == BAD_FILE) {
1306 fprintf(stderr, "Failed to get tree for expression operand:\n");
1307 ir->operands[operand]->fprint(stderr);
1308 exit(1);
1309 }
1310 op[operand] = this->result;
1311
1312 /* Matrix expression operands should have been broken down to vector
1313 * operations already.
1314 */
1315 assert(!ir->operands[operand]->type->is_matrix());
1316 }
1317
1318 /* If nothing special happens, this is the result. */
1319 this->result = result_src;
1320
1321 switch (ir->operation) {
1322 case ir_unop_logic_not:
1323 if (ctx->Const.UniformBooleanTrue != 1) {
1324 emit(NOT(result_dst, op[0]));
1325 } else {
1326 emit(XOR(result_dst, op[0], src_reg(1u)));
1327 }
1328 break;
1329 case ir_unop_neg:
1330 op[0].negate = !op[0].negate;
1331 emit(MOV(result_dst, op[0]));
1332 break;
1333 case ir_unop_abs:
1334 op[0].abs = true;
1335 op[0].negate = false;
1336 emit(MOV(result_dst, op[0]));
1337 break;
1338
1339 case ir_unop_sign:
1340 if (ir->type->is_float()) {
1341 /* AND(val, 0x80000000) gives the sign bit.
1342 *
1343 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1344 * zero.
1345 */
1346 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1347
1348 op[0].type = BRW_REGISTER_TYPE_UD;
1349 result_dst.type = BRW_REGISTER_TYPE_UD;
1350 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1351
1352 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1353 inst->predicate = BRW_PREDICATE_NORMAL;
1354
1355 this->result.type = BRW_REGISTER_TYPE_F;
1356 } else {
1357 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1358 * -> non-negative val generates 0x00000000.
1359 * Predicated OR sets 1 if val is positive.
1360 */
1361 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1362
1363 emit(ASR(result_dst, op[0], src_reg(31)));
1364
1365 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1366 inst->predicate = BRW_PREDICATE_NORMAL;
1367 }
1368 break;
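/* E.g. sign(-3.5f): the AND keeps the sign bit 0x80000000, and since the CMP
 * found a nonzero input the predicated OR merges in 0x3f800000, giving
 * 0xbf800000 == -1.0f; a 0.0 input leaves the result 0x00000000 == 0.0f.
 */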
1369
1370 case ir_unop_rcp:
1371 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1372 break;
1373
1374 case ir_unop_exp2:
1375 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1376 break;
1377 case ir_unop_log2:
1378 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1379 break;
1380 case ir_unop_exp:
1381 case ir_unop_log:
1382 unreachable("not reached: should be handled by ir_explog_to_explog2");
1383 case ir_unop_sin:
1384 case ir_unop_sin_reduced:
1385 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1386 break;
1387 case ir_unop_cos:
1388 case ir_unop_cos_reduced:
1389 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1390 break;
1391
1392 case ir_unop_dFdx:
1393 case ir_unop_dFdx_coarse:
1394 case ir_unop_dFdx_fine:
1395 case ir_unop_dFdy:
1396 case ir_unop_dFdy_coarse:
1397 case ir_unop_dFdy_fine:
1398 unreachable("derivatives not valid in vertex shader");
1399
1400 case ir_unop_bitfield_reverse:
1401 emit(BFREV(result_dst, op[0]));
1402 break;
1403 case ir_unop_bit_count:
1404 emit(CBIT(result_dst, op[0]));
1405 break;
1406 case ir_unop_find_msb: {
1407 src_reg temp = src_reg(this, glsl_type::uint_type);
1408
1409 inst = emit(FBH(dst_reg(temp), op[0]));
1410 inst->dst.writemask = WRITEMASK_XYZW;
1411
1412 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1413 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1414 * subtract the result from 31 to convert the MSB count into an LSB count.
1415 */
1416
1417 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1418 temp.swizzle = BRW_SWIZZLE_NOOP;
1419 emit(MOV(result_dst, temp));
1420
1421 src_reg src_tmp = src_reg(result_dst);
1422 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1423
1424 src_tmp.negate = true;
1425 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1426 inst->predicate = BRW_PREDICATE_NORMAL;
1427 break;
1428 }
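/* Worked example: findMSB(0x100) -> FBH returns 23 (counting from the MSB
 * side), and the predicated 31 - 23 gives the GLSL answer 8; an input of 0
 * leaves FBH's 0xFFFFFFFF, which the MOV converts to -1 and the skipped ADD
 * preserves.
 */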
1429 case ir_unop_find_lsb:
1430 emit(FBL(result_dst, op[0]));
1431 break;
1432 case ir_unop_saturate:
1433 inst = emit(MOV(result_dst, op[0]));
1434 inst->saturate = true;
1435 break;
1436
1437 case ir_unop_noise:
1438 unreachable("not reached: should be handled by lower_noise");
1439
1440 case ir_binop_add:
1441 emit(ADD(result_dst, op[0], op[1]));
1442 break;
1443 case ir_binop_sub:
1444 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1445
1446 case ir_binop_mul:
1447 if (brw->gen < 8 && ir->type->is_integer()) {
1448 /* For integer multiplication, the MUL uses the low 16 bits of one of
1449 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1450 * accumulates in the contribution of the upper 16 bits of that
1451 * operand. If we can determine that one of the args is in the low
1452 * 16 bits, though, we can just emit a single MUL.
1453 */
1454 if (ir->operands[0]->is_uint16_constant()) {
1455 if (brw->gen < 7)
1456 emit(MUL(result_dst, op[0], op[1]));
1457 else
1458 emit(MUL(result_dst, op[1], op[0]));
1459 } else if (ir->operands[1]->is_uint16_constant()) {
1460 if (brw->gen < 7)
1461 emit(MUL(result_dst, op[1], op[0]));
1462 else
1463 emit(MUL(result_dst, op[0], op[1]));
1464 } else {
1465 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1466
1467 emit(MUL(acc, op[0], op[1]));
1468 emit(MACH(dst_null_d(), op[0], op[1]));
1469 emit(MOV(result_dst, src_reg(acc)));
1470 }
1471 } else {
1472 emit(MUL(result_dst, op[0], op[1]));
1473 }
1474 break;
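/* E.g. a general 32-bit integer c = a * b above becomes MUL acc, a, b;
 * MACH null, a, b; MOV c, acc, while a multiply by a small constant such as
 * 3 collapses to a single MUL with the constant in the 16-bit operand slot.
 */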
1475 case ir_binop_imul_high: {
1476 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1477
1478 emit(MUL(acc, op[0], op[1]));
1479 emit(MACH(result_dst, op[0], op[1]));
1480 break;
1481 }
1482 case ir_binop_div:
1483 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1484 assert(ir->type->is_integer());
1485 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1486 break;
1487 case ir_binop_carry: {
1488 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1489
1490 emit(ADDC(dst_null_ud(), op[0], op[1]));
1491 emit(MOV(result_dst, src_reg(acc)));
1492 break;
1493 }
1494 case ir_binop_borrow: {
1495 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1496
1497 emit(SUBB(dst_null_ud(), op[0], op[1]));
1498 emit(MOV(result_dst, src_reg(acc)));
1499 break;
1500 }
1501 case ir_binop_mod:
1502 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1503 assert(ir->type->is_integer());
1504 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1505 break;
1506
1507 case ir_binop_less:
1508 case ir_binop_greater:
1509 case ir_binop_lequal:
1510 case ir_binop_gequal:
1511 case ir_binop_equal:
1512 case ir_binop_nequal: {
1513 emit(CMP(result_dst, op[0], op[1],
1514 brw_conditional_for_comparison(ir->operation)));
1515 if (ctx->Const.UniformBooleanTrue == 1) {
1516 emit(AND(result_dst, result_src, src_reg(1u)));
1517 }
1518 break;
1519 }
1520
1521 case ir_binop_all_equal:
1522 /* "==" operator producing a scalar boolean. */
1523 if (ir->operands[0]->type->is_vector() ||
1524 ir->operands[1]->type->is_vector()) {
1525 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1526 emit(MOV(result_dst, src_reg(0)));
1527 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1528 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1529 } else {
1530 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1531 if (ctx->Const.UniformBooleanTrue == 1) {
1532 emit(AND(result_dst, result_src, src_reg(1u)));
1533 }
1534 }
1535 break;
1536 case ir_binop_any_nequal:
1537 /* "!=" operator producing a scalar boolean. */
1538 if (ir->operands[0]->type->is_vector() ||
1539 ir->operands[1]->type->is_vector()) {
1540 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1541
1542 emit(MOV(result_dst, src_reg(0)));
1543 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1544 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1545 } else {
1546 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1547 if (ctx->Const.UniformBooleanTrue == 1) {
1548 emit(AND(result_dst, result_src, src_reg(1u)));
1549 }
1550 }
1551 break;
1552
1553 case ir_unop_any:
1554 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1555 emit(MOV(result_dst, src_reg(0)));
1556
1557 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1558 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1559 break;
1560
1561 case ir_binop_logic_xor:
1562 emit(XOR(result_dst, op[0], op[1]));
1563 break;
1564
1565 case ir_binop_logic_or:
1566 emit(OR(result_dst, op[0], op[1]));
1567 break;
1568
1569 case ir_binop_logic_and:
1570 emit(AND(result_dst, op[0], op[1]));
1571 break;
1572
1573 case ir_binop_dot:
1574 assert(ir->operands[0]->type->is_vector());
1575 assert(ir->operands[0]->type == ir->operands[1]->type);
1576 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1577 break;
1578
1579 case ir_unop_sqrt:
1580 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1581 break;
1582 case ir_unop_rsq:
1583 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1584 break;
1585
1586 case ir_unop_bitcast_i2f:
1587 case ir_unop_bitcast_u2f:
1588 this->result = op[0];
1589 this->result.type = BRW_REGISTER_TYPE_F;
1590 break;
1591
1592 case ir_unop_bitcast_f2i:
1593 this->result = op[0];
1594 this->result.type = BRW_REGISTER_TYPE_D;
1595 break;
1596
1597 case ir_unop_bitcast_f2u:
1598 this->result = op[0];
1599 this->result.type = BRW_REGISTER_TYPE_UD;
1600 break;
1601
1602 case ir_unop_i2f:
1603 case ir_unop_i2u:
1604 case ir_unop_u2i:
1605 case ir_unop_u2f:
1606 case ir_unop_f2i:
1607 case ir_unop_f2u:
1608 emit(MOV(result_dst, op[0]));
1609 break;
1610 case ir_unop_b2i:
1611 if (ctx->Const.UniformBooleanTrue != 1) {
1612 emit(AND(result_dst, op[0], src_reg(1u)));
1613 } else {
1614 emit(MOV(result_dst, op[0]));
1615 }
1616 break;
1617 case ir_unop_b2f:
1618 if (ctx->Const.UniformBooleanTrue != 1) {
1619 op[0].type = BRW_REGISTER_TYPE_UD;
1620 result_dst.type = BRW_REGISTER_TYPE_UD;
1621 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1622 result_dst.type = BRW_REGISTER_TYPE_F;
1623 } else {
1624 emit(MOV(result_dst, op[0]));
1625 }
1626 break;
1627 case ir_unop_f2b:
1628 case ir_unop_i2b:
1629 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1630 if (ctx->Const.UniformBooleanTrue == 1) {
1631 emit(AND(result_dst, result_src, src_reg(1u)));
1632 }
1633 break;
1634
1635 case ir_unop_trunc:
1636 emit(RNDZ(result_dst, op[0]));
1637 break;
1638 case ir_unop_ceil:
1639 op[0].negate = !op[0].negate;
1640 inst = emit(RNDD(result_dst, op[0]));
1641 this->result.negate = true;
1642 break;
1643 case ir_unop_floor:
1644 inst = emit(RNDD(result_dst, op[0]));
1645 break;
1646 case ir_unop_fract:
1647 inst = emit(FRC(result_dst, op[0]));
1648 break;
1649 case ir_unop_round_even:
1650 emit(RNDE(result_dst, op[0]));
1651 break;
1652
1653 case ir_binop_min:
1654 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1655 break;
1656 case ir_binop_max:
1657 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1658 break;
1659
1660 case ir_binop_pow:
1661 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1662 break;
1663
1664 case ir_unop_bit_not:
1665 inst = emit(NOT(result_dst, op[0]));
1666 break;
1667 case ir_binop_bit_and:
1668 inst = emit(AND(result_dst, op[0], op[1]));
1669 break;
1670 case ir_binop_bit_xor:
1671 inst = emit(XOR(result_dst, op[0], op[1]));
1672 break;
1673 case ir_binop_bit_or:
1674 inst = emit(OR(result_dst, op[0], op[1]));
1675 break;
1676
1677 case ir_binop_lshift:
1678 inst = emit(SHL(result_dst, op[0], op[1]));
1679 break;
1680
1681 case ir_binop_rshift:
1682 if (ir->type->base_type == GLSL_TYPE_INT)
1683 inst = emit(ASR(result_dst, op[0], op[1]));
1684 else
1685 inst = emit(SHR(result_dst, op[0], op[1]));
1686 break;
1687
1688 case ir_binop_bfm:
1689 emit(BFI1(result_dst, op[0], op[1]));
1690 break;
1691
1692 case ir_binop_ubo_load: {
1693 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1694 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1695 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1696 src_reg offset;
1697
1698 /* Now, load the vector from that offset. */
1699 assert(ir->type->is_vector() || ir->type->is_scalar());
1700
1701 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1702 packed_consts.type = result.type;
1703 src_reg surf_index;
1704
1705 if (const_uniform_block) {
1706 /* The block index is a constant, so just emit the binding table entry
1707 * as an immediate.
1708 */
1709 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1710 const_uniform_block->value.u[0]);
1711 } else {
1712 /* The block index is not a constant. Evaluate the index expression
1713 * per-channel and add the base UBO index; the generator will select
1714 * a value from any live channel.
1715 */
1716 surf_index = src_reg(this, glsl_type::uint_type);
1717 emit(ADD(dst_reg(surf_index), op[0],
1718 src_reg(prog_data->base.binding_table.ubo_start)));
1719
1720 /* Assume this may touch any UBO. It would be nice to provide
1721 * a tighter bound, but the array information is already lowered away.
1722 */
1723 brw_mark_surface_used(&prog_data->base,
1724 prog_data->base.binding_table.ubo_start +
1725 shader_prog->NumUniformBlocks - 1);
1726 }
1727
1728 if (const_offset_ir) {
1729 if (brw->gen >= 8) {
1730 /* Store the offset in a GRF so we can send-from-GRF. */
1731 offset = src_reg(this, glsl_type::int_type);
1732 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1733 } else {
1734 /* Immediates are fine on older generations since they'll be moved
1735 * to a (potentially fake) MRF at the generator level.
1736 */
1737 offset = src_reg(const_offset / 16);
1738 }
1739 } else {
1740 offset = src_reg(this, glsl_type::uint_type);
1741 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1742 }
1743
1744 if (brw->gen >= 7) {
1745 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1746 grf_offset.type = offset.type;
1747
1748 emit(MOV(grf_offset, offset));
1749
1750 emit(new(mem_ctx) vec4_instruction(this,
1751 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1752 dst_reg(packed_consts),
1753 surf_index,
1754 src_reg(grf_offset)));
1755 } else {
1756 vec4_instruction *pull =
1757 emit(new(mem_ctx) vec4_instruction(this,
1758 VS_OPCODE_PULL_CONSTANT_LOAD,
1759 dst_reg(packed_consts),
1760 surf_index,
1761 offset));
1762 pull->base_mrf = 14;
1763 pull->mlen = 1;
1764 }
1765
1766 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1767 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1768 const_offset % 16 / 4,
1769 const_offset % 16 / 4,
1770 const_offset % 16 / 4);
1771
1772 /* UBO bools are any nonzero int. We need to convert them to use the
1773 * value of true stored in ctx->Const.UniformBooleanTrue.
1774 */
1775 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1776 emit(CMP(result_dst, packed_consts, src_reg(0u),
1777 BRW_CONDITIONAL_NZ));
1778 if (ctx->Const.UniformBooleanTrue == 1) {
1779 emit(AND(result_dst, result, src_reg(1u)));
1780 }
1781 } else {
1782 emit(MOV(result_dst, packed_consts));
1783 }
1784 break;
1785 }
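/* As a sketch of the swizzle math above: a float UBO load at byte offset 20
 * fetches the 16-byte block starting at offset 16, and 20 % 16 / 4 == 1
 * selects component .y of packed_consts for every channel of the result.
 */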
1786
1787 case ir_binop_vector_extract:
1788 unreachable("should have been lowered by vec_index_to_cond_assign");
1789
1790 case ir_triop_fma:
1791 op[0] = fix_3src_operand(op[0]);
1792 op[1] = fix_3src_operand(op[1]);
1793 op[2] = fix_3src_operand(op[2]);
1794 /* Note that the instruction's argument order is reversed from GLSL
1795 * and the IR.
1796 */
1797 emit(MAD(result_dst, op[2], op[1], op[0]));
1798 break;
1799
1800 case ir_triop_lrp:
1801 emit_lrp(result_dst, op[0], op[1], op[2]);
1802 break;
1803
1804 case ir_triop_csel:
1805 unreachable("already handled above");
1806 break;
1807
1808 case ir_triop_bfi:
1809 op[0] = fix_3src_operand(op[0]);
1810 op[1] = fix_3src_operand(op[1]);
1811 op[2] = fix_3src_operand(op[2]);
1812 emit(BFI2(result_dst, op[0], op[1], op[2]));
1813 break;
1814
1815 case ir_triop_bitfield_extract:
1816 op[0] = fix_3src_operand(op[0]);
1817 op[1] = fix_3src_operand(op[1]);
1818 op[2] = fix_3src_operand(op[2]);
1819 /* Note that the instruction's argument order is reversed from GLSL
1820 * and the IR.
1821 */
1822 emit(BFE(result_dst, op[2], op[1], op[0]));
1823 break;
1824
1825 case ir_triop_vector_insert:
1826 unreachable("should have been lowered by lower_vector_insert");
1827
1828 case ir_quadop_bitfield_insert:
1829 unreachable("not reached: should be handled by "
1830 "bitfield_insert_to_bfm_bfi\n");
1831
1832 case ir_quadop_vector:
1833 unreachable("not reached: should be handled by lower_quadop_vector");
1834
1835 case ir_unop_pack_half_2x16:
1836 emit_pack_half_2x16(result_dst, op[0]);
1837 break;
1838 case ir_unop_unpack_half_2x16:
1839 emit_unpack_half_2x16(result_dst, op[0]);
1840 break;
1841 case ir_unop_unpack_unorm_4x8:
1842 emit_unpack_unorm_4x8(result_dst, op[0]);
1843 break;
1844 case ir_unop_unpack_snorm_4x8:
1845 emit_unpack_snorm_4x8(result_dst, op[0]);
1846 break;
1847 case ir_unop_pack_unorm_4x8:
1848 emit_pack_unorm_4x8(result_dst, op[0]);
1849 break;
1850 case ir_unop_pack_snorm_4x8:
1851 emit_pack_snorm_4x8(result_dst, op[0]);
1852 break;
1853 case ir_unop_pack_snorm_2x16:
1854 case ir_unop_pack_unorm_2x16:
1855 case ir_unop_unpack_snorm_2x16:
1856 case ir_unop_unpack_unorm_2x16:
1857 unreachable("not reached: should be handled by lower_packing_builtins");
1858 case ir_unop_unpack_half_2x16_split_x:
1859 case ir_unop_unpack_half_2x16_split_y:
1860 case ir_binop_pack_half_2x16_split:
1861 case ir_unop_interpolate_at_centroid:
1862 case ir_binop_interpolate_at_sample:
1863 case ir_binop_interpolate_at_offset:
1864 unreachable("not reached: should not occur in vertex shader");
1865 case ir_binop_ldexp:
1866 unreachable("not reached: should be handled by ldexp_to_arith()");
1867 }
1868 }
1869
1870
1871 void
1872 vec4_visitor::visit(ir_swizzle *ir)
1873 {
1874 src_reg src;
1875 int i = 0;
1876 int swizzle[4];
1877
1878 /* Note that this is only swizzles in expressions, not those on the left
1879 * hand side of an assignment, which do write masking. See ir_assignment
1880 * for that.
1881 */
1882
1883 ir->val->accept(this);
1884 src = this->result;
1885 assert(src.file != BAD_FILE);
1886
1887 for (i = 0; i < ir->type->vector_elements; i++) {
1888 switch (i) {
1889 case 0:
1890 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1891 break;
1892 case 1:
1893 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1894 break;
1895 case 2:
1896 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1897 break;
1898 case 3:
1899 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1900 break;
1901 }
1902 }
1903 for (; i < 4; i++) {
1904 /* Replicate the last channel out. */
1905 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1906 }
1907
1908 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1909
1910 this->result = src;
1911 }
1912
1913 void
1914 vec4_visitor::visit(ir_dereference_variable *ir)
1915 {
1916 const struct glsl_type *type = ir->type;
1917 dst_reg *reg = variable_storage(ir->var);
1918
1919 if (!reg) {
1920 fail("Failed to find variable storage for %s\n", ir->var->name);
1921 this->result = src_reg(brw_null_reg());
1922 return;
1923 }
1924
1925 this->result = src_reg(*reg);
1926
1927 /* System values get their swizzle from the dst_reg writemask */
1928 if (ir->var->data.mode == ir_var_system_value)
1929 return;
1930
1931 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1932 this->result.swizzle = swizzle_for_size(type->vector_elements);
1933 }
1934
1935
1936 int
1937 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1938 {
1939 /* Under normal circumstances array elements are stored consecutively, so
1940 * the stride is equal to the size of the array element.
1941 */
1942 return type_size(ir->type);
1943 }
1944
1945
1946 void
1947 vec4_visitor::visit(ir_dereference_array *ir)
1948 {
1949 ir_constant *constant_index;
1950 src_reg src;
1951 int array_stride = compute_array_stride(ir);
1952
1953 constant_index = ir->array_index->constant_expression_value();
1954
1955 ir->array->accept(this);
1956 src = this->result;
1957
1958 if (constant_index) {
1959 src.reg_offset += constant_index->value.i[0] * array_stride;
1960 } else {
1961 /* Variable index array dereference. It eats the "vec4" of the
1962 * base of the array and an index that offsets the Mesa register
1963 * index.
1964 */
1965 ir->array_index->accept(this);
1966
1967 src_reg index_reg;
1968
1969 if (array_stride == 1) {
1970 index_reg = this->result;
1971 } else {
1972 index_reg = src_reg(this, glsl_type::int_type);
1973
1974 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1975 }
1976
1977 if (src.reladdr) {
1978 src_reg temp = src_reg(this, glsl_type::int_type);
1979
1980 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1981
1982 index_reg = temp;
1983 }
1984
1985 src.reladdr = ralloc(mem_ctx, src_reg);
1986 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1987 }
1988
1989 /* If the type is smaller than a vec4, replicate the last channel out. */
1990 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1991 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1992 else
1993 src.swizzle = BRW_SWIZZLE_NOOP;
1994 src.type = brw_type_for_base_type(ir->type);
1995
1996 this->result = src;
1997 }
1998
1999 void
2000 vec4_visitor::visit(ir_dereference_record *ir)
2001 {
2002 unsigned int i;
2003 const glsl_type *struct_type = ir->record->type;
2004 int offset = 0;
2005
2006 ir->record->accept(this);
2007
2008 for (i = 0; i < struct_type->length; i++) {
2009 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2010 break;
2011 offset += type_size(struct_type->fields.structure[i].type);
2012 }
2013
2014 /* If the type is smaller than a vec4, replicate the last channel out. */
2015 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2016 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2017 else
2018 this->result.swizzle = BRW_SWIZZLE_NOOP;
2019 this->result.type = brw_type_for_base_type(ir->type);
2020
2021 this->result.reg_offset += offset;
2022 }
2023
2024 /**
2025 * We want to be careful in assignment setup to hit the actual storage
2026 * instead of potentially using a temporary like we might with the
2027 * ir_dereference handler.
2028 */
2029 static dst_reg
2030 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2031 {
2032 /* The LHS must be a dereference. If the LHS is a variable indexed array
2033 * access of a vector, it must be separated into a series of conditional moves
2034 * before reaching this point (see ir_vec_index_to_cond_assign).
2035 */
2036 assert(ir->as_dereference());
2037 ir_dereference_array *deref_array = ir->as_dereference_array();
2038 if (deref_array) {
2039 assert(!deref_array->array->type->is_vector());
2040 }
2041
2042 /* Use the rvalue deref handler for the most part. We'll ignore
2043 * swizzles in it and write swizzles using writemask, though.
2044 */
2045 ir->accept(v);
2046 return dst_reg(v->result);
2047 }
2048
2049 void
2050 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2051 const struct glsl_type *type,
2052 enum brw_predicate predicate)
2053 {
2054 if (type->base_type == GLSL_TYPE_STRUCT) {
2055 for (unsigned int i = 0; i < type->length; i++) {
2056 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2057 }
2058 return;
2059 }
2060
2061 if (type->is_array()) {
2062 for (unsigned int i = 0; i < type->length; i++) {
2063 emit_block_move(dst, src, type->fields.array, predicate);
2064 }
2065 return;
2066 }
2067
2068 if (type->is_matrix()) {
2069 const struct glsl_type *vec_type;
2070
2071 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2072 type->vector_elements, 1);
2073
2074 for (int i = 0; i < type->matrix_columns; i++) {
2075 emit_block_move(dst, src, vec_type, predicate);
2076 }
2077 return;
2078 }
2079
2080 assert(type->is_scalar() || type->is_vector());
2081
2082 dst->type = brw_type_for_base_type(type);
2083 src->type = dst->type;
2084
2085 dst->writemask = (1 << type->vector_elements) - 1;
2086
2087 src->swizzle = swizzle_for_size(type->vector_elements);
2088
2089 vec4_instruction *inst = emit(MOV(*dst, *src));
2090 inst->predicate = predicate;
2091
2092 dst->reg_offset++;
2093 src->reg_offset++;
2094 }
2095
2096
2097 /* If the RHS processing resulted in an instruction generating a
2098 * temporary value, and it would be easy to rewrite the instruction to
2099 * generate its result right into the LHS instead, do so. This ends
2100 * up reliably removing instructions where it can be tricky to do so
2101 * later without real UD chain information.
2102 */
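/* For example, if the RHS "a + b" generated "ADD tmp, a, b", we can
 * retarget that ADD to write the assignment's destination directly and
 * skip emitting the "MOV dst, tmp" copy.
 */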
2103 bool
2104 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2105 dst_reg dst,
2106 src_reg src,
2107 vec4_instruction *pre_rhs_inst,
2108 vec4_instruction *last_rhs_inst)
2109 {
2110 /* This could be supported, but it would take more smarts. */
2111 if (ir->condition)
2112 return false;
2113
2114 if (pre_rhs_inst == last_rhs_inst)
2115 return false; /* No instructions generated to work with. */
2116
2117 /* Make sure the last instruction generated our source reg. */
2118 if (src.file != GRF ||
2119 src.file != last_rhs_inst->dst.file ||
2120 src.reg != last_rhs_inst->dst.reg ||
2121 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2122 src.reladdr ||
2123 src.abs ||
2124 src.negate ||
2125 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2126 return false;
2127
2128 /* Check that the last instruction fully initialized the channels
2129 * we want to use, in the order we want to use them. We could
2130 * potentially reswizzle the operands of many instructions so that
2131 * we could handle out-of-order channels, but don't yet.
2132 */
2133
2134 for (unsigned i = 0; i < 4; i++) {
2135 if (dst.writemask & (1 << i)) {
2136 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2137 return false;
2138
2139 if (BRW_GET_SWZ(src.swizzle, i) != i)
2140 return false;
2141 }
2142 }
2143
2144 /* Success! Rewrite the instruction. */
2145 last_rhs_inst->dst.file = dst.file;
2146 last_rhs_inst->dst.reg = dst.reg;
2147 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2148 last_rhs_inst->dst.reladdr = dst.reladdr;
2149 last_rhs_inst->dst.writemask &= dst.writemask;
2150
2151 return true;
2152 }
2153
2154 void
2155 vec4_visitor::visit(ir_assignment *ir)
2156 {
2157 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2158 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2159
2160 if (!ir->lhs->type->is_scalar() &&
2161 !ir->lhs->type->is_vector()) {
2162 ir->rhs->accept(this);
2163 src_reg src = this->result;
2164
2165 if (ir->condition) {
2166 emit_bool_to_cond_code(ir->condition, &predicate);
2167 }
2168
2169 /* emit_block_move doesn't account for swizzles in the source register.
2170 * This should be ok, since the source register is a structure or an
2171 * array, and those can't be swizzled. But double-check to be sure.
2172 */
2173 assert(src.swizzle ==
2174 (ir->rhs->type->is_matrix()
2175 ? swizzle_for_size(ir->rhs->type->vector_elements)
2176 : BRW_SWIZZLE_NOOP));
2177
2178 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2179 return;
2180 }
2181
2182 /* Now we're down to just a scalar/vector with writemasks. */
2183 int i;
2184
2185 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2186 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2187
2188 ir->rhs->accept(this);
2189
2190 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2191
2192 src_reg src = this->result;
2193
2194 int swizzles[4];
2195 int first_enabled_chan = 0;
2196 int src_chan = 0;
2197
2198 assert(ir->lhs->type->is_vector() ||
2199 ir->lhs->type->is_scalar());
2200 dst.writemask = ir->write_mask;
2201
2202 for (int i = 0; i < 4; i++) {
2203 if (dst.writemask & (1 << i)) {
2204 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2205 break;
2206 }
2207 }
2208
2209 /* Swizzle a small RHS vector into the channels being written.
2210 *
2211 * GLSL IR treats write_mask as dictating how many channels are
2212 * present on the RHS, while in our instructions we need to make
2213 * those channels appear in the slots of the vec4 they're written to.
2214 */
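   /* For example, for "v.xz = vec2(a, b)" the write_mask is .xz and the
    * RHS typically arrives swizzled .xyyy (see swizzle_for_size); the
    * loop below builds .xxyx so RHS channel x lands in slot x, channel y
    * lands in slot z, and disabled slots just replicate the first
    * enabled channel.
    */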
2215 for (int i = 0; i < 4; i++) {
2216 if (dst.writemask & (1 << i))
2217 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2218 else
2219 swizzles[i] = first_enabled_chan;
2220 }
2221 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2222 swizzles[2], swizzles[3]);
2223
2224 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2225 return;
2226 }
2227
2228 if (ir->condition) {
2229 emit_bool_to_cond_code(ir->condition, &predicate);
2230 }
2231
2232 for (i = 0; i < type_size(ir->lhs->type); i++) {
2233 vec4_instruction *inst = emit(MOV(dst, src));
2234 inst->predicate = predicate;
2235
2236 dst.reg_offset++;
2237 src.reg_offset++;
2238 }
2239 }
2240
2241 void
2242 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2243 {
2244 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2245 foreach_in_list(ir_constant, field_value, &ir->components) {
2246 emit_constant_values(dst, field_value);
2247 }
2248 return;
2249 }
2250
2251 if (ir->type->is_array()) {
2252 for (unsigned int i = 0; i < ir->type->length; i++) {
2253 emit_constant_values(dst, ir->array_elements[i]);
2254 }
2255 return;
2256 }
2257
2258 if (ir->type->is_matrix()) {
2259 for (int i = 0; i < ir->type->matrix_columns; i++) {
2260 float *vec = &ir->value.f[i * ir->type->vector_elements];
2261
2262 for (int j = 0; j < ir->type->vector_elements; j++) {
2263 dst->writemask = 1 << j;
2264 dst->type = BRW_REGISTER_TYPE_F;
2265
2266 emit(MOV(*dst, src_reg(vec[j])));
2267 }
2268 dst->reg_offset++;
2269 }
2270 return;
2271 }
2272
2273 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2274
2275 for (int i = 0; i < ir->type->vector_elements; i++) {
2276 if (!(remaining_writemask & (1 << i)))
2277 continue;
2278
2279 dst->writemask = 1 << i;
2280 dst->type = brw_type_for_base_type(ir->type);
2281
2282 /* Find other components that match the one we're about to
2283 * write. Emits fewer instructions for things like vec4(0.5,
2284 * 1.5, 1.5, 1.5).
2285 */
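      /* E.g. vec4(0.5, 1.5, 1.5, 1.5) becomes two MOVs (one writing .x
       * with 0.5, one writing .yzw with 1.5) instead of four.
       */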
2286 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2287 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2288 if (ir->value.b[i] == ir->value.b[j])
2289 dst->writemask |= (1 << j);
2290 } else {
2291 /* u, i, and f storage all line up, so no need for a
2292 * switch case for comparing each type.
2293 */
2294 if (ir->value.u[i] == ir->value.u[j])
2295 dst->writemask |= (1 << j);
2296 }
2297 }
2298
2299 switch (ir->type->base_type) {
2300 case GLSL_TYPE_FLOAT:
2301 emit(MOV(*dst, src_reg(ir->value.f[i])));
2302 break;
2303 case GLSL_TYPE_INT:
2304 emit(MOV(*dst, src_reg(ir->value.i[i])));
2305 break;
2306 case GLSL_TYPE_UINT:
2307 emit(MOV(*dst, src_reg(ir->value.u[i])));
2308 break;
2309 case GLSL_TYPE_BOOL:
2310 emit(MOV(*dst,
2311 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2312 : 0u)));
2313 break;
2314 default:
2315 unreachable("Non-float/uint/int/bool constant");
2316 }
2317
2318 remaining_writemask &= ~dst->writemask;
2319 }
2320 dst->reg_offset++;
2321 }
2322
2323 void
2324 vec4_visitor::visit(ir_constant *ir)
2325 {
2326 dst_reg dst = dst_reg(this, ir->type);
2327 this->result = src_reg(dst);
2328
2329 emit_constant_values(&dst, ir);
2330 }
2331
2332 void
2333 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2334 {
2335 ir_dereference *deref = static_cast<ir_dereference *>(
2336 ir->actual_parameters.get_head());
2337 ir_variable *location = deref->variable_referenced();
2338 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2339 location->data.binding);
2340
2341 /* Calculate the surface offset */
2342 src_reg offset(this, glsl_type::uint_type);
2343 ir_dereference_array *deref_array = deref->as_dereference_array();
2344 if (deref_array) {
2345 deref_array->array_index->accept(this);
2346
2347 src_reg tmp(this, glsl_type::uint_type);
2348 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2349 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2350 } else {
2351 offset = location->data.atomic.offset;
2352 }
2353
2354 /* Emit the appropriate machine instruction */
2355 const char *callee = ir->callee->function_name();
2356 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2357
2358 if (!strcmp("__intrinsic_atomic_read", callee)) {
2359 emit_untyped_surface_read(surf_index, dst, offset);
2360
2361 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2362 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2363 src_reg(), src_reg());
2364
2365 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2366 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2367 src_reg(), src_reg());
2368 }
2369 }
2370
2371 void
2372 vec4_visitor::visit(ir_call *ir)
2373 {
2374 const char *callee = ir->callee->function_name();
2375
2376 if (!strcmp("__intrinsic_atomic_read", callee) ||
2377 !strcmp("__intrinsic_atomic_increment", callee) ||
2378 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2379 visit_atomic_counter_intrinsic(ir);
2380 } else {
2381 unreachable("Unsupported intrinsic.");
2382 }
2383 }
2384
2385 src_reg
2386 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2387 {
2388 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2389 inst->base_mrf = 2;
2390 inst->mlen = 1;
2391 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2392 inst->dst.writemask = WRITEMASK_XYZW;
2393
2394 inst->src[1] = sampler;
2395
2396 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2397 int param_base = inst->base_mrf;
2398 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2399 int zero_mask = 0xf & ~coord_mask;
2400
2401 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2402 coordinate));
2403
2404 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2405 src_reg(0)));
2406
2407 emit(inst);
2408 return src_reg(inst->dst);
2409 }
2410
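/* Samplers that can't be encoded directly in the 4-bit sampler field of
 * the message descriptor (a non-immediate index, or an index of 16 or
 * more) have to be selected through the message header instead; see the
 * header_present computation in visit(ir_texture) below.
 */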
2411 static bool
2412 is_high_sampler(struct brw_context *brw, src_reg sampler)
2413 {
2414 if (brw->gen < 8 && !brw->is_haswell)
2415 return false;
2416
2417 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2418 }
2419
2420 void
2421 vec4_visitor::visit(ir_texture *ir)
2422 {
2423 uint32_t sampler =
2424 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2425
2426 ir_rvalue *nonconst_sampler_index =
2427 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2428
2429 /* Handle non-constant sampler array indexing */
2430 src_reg sampler_reg;
2431 if (nonconst_sampler_index) {
2432 /* The highest sampler which may be used by this operation is
2433 * the last element of the array. Mark it here, because the generator
2434 * doesn't have enough information to determine the bound.
2435 */
2436 uint32_t array_size = ir->sampler->as_dereference_array()
2437 ->array->type->array_size();
2438
2439 uint32_t max_used = sampler + array_size - 1;
2440 if (ir->op == ir_tg4 && brw->gen < 8) {
2441 max_used += prog_data->base.binding_table.gather_texture_start;
2442 } else {
2443 max_used += prog_data->base.binding_table.texture_start;
2444 }
2445
2446 brw_mark_surface_used(&prog_data->base, max_used);
2447
2448 /* Emit code to evaluate the actual indexing expression */
2449 nonconst_sampler_index->accept(this);
2450 dst_reg temp(this, glsl_type::uint_type);
2451 emit(ADD(temp, this->result, src_reg(sampler)))
2452 ->force_writemask_all = true;
2453 sampler_reg = src_reg(temp);
2454 } else {
2455 /* Single sampler, or constant array index; the indexing expression
2456 * is just an immediate.
2457 */
2458 sampler_reg = src_reg(sampler);
2459 }
2460
2461 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2462 * emitting anything other than setting up the constant result.
2463 */
2464 if (ir->op == ir_tg4) {
2465 ir_constant *chan = ir->lod_info.component->as_constant();
2466 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2467 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2468 dst_reg result(this, ir->type);
2469 this->result = src_reg(result);
2470 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2471 return;
2472 }
2473 }
2474
2475 /* Should be lowered by do_lower_texture_projection */
2476 assert(!ir->projector);
2477
2478 /* Should be lowered */
2479 assert(!ir->offset || !ir->offset->type->is_array());
2480
2481 /* Generate code to compute all the subexpression trees. This has to be
2482 * done before loading any values into MRFs for the sampler message since
2483 * generating these values may involve SEND messages that need the MRFs.
2484 */
2485 src_reg coordinate;
2486 if (ir->coordinate) {
2487 ir->coordinate->accept(this);
2488 coordinate = this->result;
2489 }
2490
2491 src_reg shadow_comparitor;
2492 if (ir->shadow_comparitor) {
2493 ir->shadow_comparitor->accept(this);
2494 shadow_comparitor = this->result;
2495 }
2496
2497 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2498 src_reg offset_value;
2499 if (has_nonconstant_offset) {
2500 ir->offset->accept(this);
2501 offset_value = src_reg(this->result);
2502 }
2503
2504 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2505 src_reg lod, dPdx, dPdy, sample_index, mcs;
2506 switch (ir->op) {
2507 case ir_tex:
2508 lod = src_reg(0.0f);
2509 lod_type = glsl_type::float_type;
2510 break;
2511 case ir_txf:
2512 case ir_txl:
2513 case ir_txs:
2514 ir->lod_info.lod->accept(this);
2515 lod = this->result;
2516 lod_type = ir->lod_info.lod->type;
2517 break;
2518 case ir_query_levels:
2519 lod = src_reg(0);
2520 lod_type = glsl_type::int_type;
2521 break;
2522 case ir_txf_ms:
2523 ir->lod_info.sample_index->accept(this);
2524 sample_index = this->result;
2525 sample_index_type = ir->lod_info.sample_index->type;
2526
2527 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2528 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2529 else
2530 mcs = src_reg(0u);
2531 break;
2532 case ir_txd:
2533 ir->lod_info.grad.dPdx->accept(this);
2534 dPdx = this->result;
2535
2536 ir->lod_info.grad.dPdy->accept(this);
2537 dPdy = this->result;
2538
2539 lod_type = ir->lod_info.grad.dPdx->type;
2540 break;
2541 case ir_txb:
2542 case ir_lod:
2543 case ir_tg4:
2544 break;
2545 }
2546
2547 enum opcode opcode;
2548 switch (ir->op) {
2549 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2550 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2551 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2552 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2553 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2554 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2555 case ir_tg4: opcode = has_nonconstant_offset
2556 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2557 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2558 case ir_txb:
2559 unreachable("TXB is not valid for vertex shaders.");
2560 case ir_lod:
2561 unreachable("LOD is not valid for vertex shaders.");
2562 default:
2563 unreachable("Unrecognized tex op");
2564 }
2565
2566 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2567
2568 if (ir->offset != NULL && !has_nonconstant_offset) {
2569 inst->offset =
2570 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2571 ir->offset->type->vector_elements);
2572 }
2573
2574 /* Stuff the channel select bits in the top of the texture offset */
2575 if (ir->op == ir_tg4)
2576 inst->offset |= gather_channel(ir, sampler) << 16;
2577
2578 /* The message header is necessary for:
2579 * - Gen4 (always)
2580 * - Texel offsets
2581 * - Gather channel selection
2582 * - Sampler indices too large to fit in a 4-bit value.
2583 */
2584 inst->header_present =
2585 brw->gen < 5 || inst->offset != 0 || ir->op == ir_tg4 ||
2586 is_high_sampler(brw, sampler_reg);
2587 inst->base_mrf = 2;
2588 inst->mlen = inst->header_present + 1; /* always at least one */
2589 inst->dst = dst_reg(this, ir->type);
2590 inst->dst.writemask = WRITEMASK_XYZW;
2591 inst->shadow_compare = ir->shadow_comparitor != NULL;
2592
2593 inst->src[1] = sampler_reg;
2594
2595 /* MRF for the first parameter */
2596 int param_base = inst->base_mrf + inst->header_present;
2597
2598 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2599 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2600 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2601 } else {
2602 /* Load the coordinate */
2603 /* FINISHME: gl_clamp_mask and saturate */
2604 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2605 int zero_mask = 0xf & ~coord_mask;
2606
2607 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2608 coordinate));
2609
2610 if (zero_mask != 0) {
2611 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2612 src_reg(0)));
2613 }
2614 /* Load the shadow comparitor */
2615 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2616 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2617 WRITEMASK_X),
2618 shadow_comparitor));
2619 inst->mlen++;
2620 }
2621
2622 /* Load the LOD info */
2623 if (ir->op == ir_tex || ir->op == ir_txl) {
2624 int mrf, writemask;
2625 if (brw->gen >= 5) {
2626 mrf = param_base + 1;
2627 if (ir->shadow_comparitor) {
2628 writemask = WRITEMASK_Y;
2629 /* mlen already incremented */
2630 } else {
2631 writemask = WRITEMASK_X;
2632 inst->mlen++;
2633 }
2634 } else /* brw->gen == 4 */ {
2635 mrf = param_base;
2636 writemask = WRITEMASK_W;
2637 }
2638 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2639 } else if (ir->op == ir_txf) {
2640 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2641 } else if (ir->op == ir_txf_ms) {
2642 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2643 sample_index));
2644 if (brw->gen >= 7) {
2645 /* MCS data is in the first channel of `mcs`, but we need to get it into
2646 * the .y channel of the second vec4 of params, so replicate .x across
2647 * the whole vec4 and then mask off everything except .y
2648 */
2649 mcs.swizzle = BRW_SWIZZLE_XXXX;
2650 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2651 mcs));
2652 }
2653 inst->mlen++;
2654 } else if (ir->op == ir_txd) {
2655 const glsl_type *type = lod_type;
2656
2657 if (brw->gen >= 5) {
2658 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2659 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2660 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2661 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2662 inst->mlen++;
2663
2664 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2665 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2666 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2667 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2668 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2669 inst->mlen++;
2670
2671 if (ir->shadow_comparitor) {
2672 emit(MOV(dst_reg(MRF, param_base + 2,
2673 ir->shadow_comparitor->type, WRITEMASK_Z),
2674 shadow_comparitor));
2675 }
2676 }
2677 } else /* brw->gen == 4 */ {
2678 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2679 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2680 inst->mlen += 2;
2681 }
2682 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2683 if (ir->shadow_comparitor) {
2684 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2685 shadow_comparitor));
2686 }
2687
2688 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2689 offset_value));
2690 inst->mlen++;
2691 }
2692 }
2693
2694 emit(inst);
2695
2696 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2697 * faces * layers; the spec requires layers.
2698 */
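   /* E.g. textureSize() on a samplerCubeArray with 4 layers reports 24
    * in .z here; dividing by 6 yields the 4 layers the spec requires.
    */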
2699 if (ir->op == ir_txs) {
2700 glsl_type const *type = ir->sampler->type;
2701 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2702 type->sampler_array) {
2703 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2704 writemask(inst->dst, WRITEMASK_Z),
2705 src_reg(inst->dst), src_reg(6));
2706 }
2707 }
2708
2709 if (brw->gen == 6 && ir->op == ir_tg4) {
2710 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2711 }
2712
2713 swizzle_result(ir, src_reg(inst->dst), sampler);
2714 }
2715
2716 /**
2717 * Apply workarounds for Gen6 gather with UINT/SINT
2718 */
2719 void
2720 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2721 {
2722 if (!wa)
2723 return;
2724
2725 int width = (wa & WA_8BIT) ? 8 : 16;
2726 dst_reg dst_f = dst;
2727 dst_f.type = BRW_REGISTER_TYPE_F;
2728
2729 /* Convert from UNORM to UINT */
2730 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2731 emit(MOV(dst, src_reg(dst_f)));
2732
2733 if (wa & WA_SIGN) {
2734 /* Reinterpret the UINT value as a signed INT value by
2735 * shifting the sign bit into place, then shifting back
2736 * preserving sign.
2737 */
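      /* E.g. for an 8-bit result both shifts below are by 24, moving bit 7
       * into the sign position and back, sign-extending the value.
       */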
2738 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2739 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2740 }
2741 }
2742
2743 /**
2744 * Set up the gather channel based on the swizzle, for gather4.
2745 */
2746 uint32_t
2747 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2748 {
2749 ir_constant *chan = ir->lod_info.component->as_constant();
2750 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2751 switch (swiz) {
2752 case SWIZZLE_X: return 0;
2753 case SWIZZLE_Y:
2754 /* gather4 sampler is broken for green channel on RG32F --
2755 * we must ask for blue instead.
2756 */
2757 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2758 return 2;
2759 return 1;
2760 case SWIZZLE_Z: return 2;
2761 case SWIZZLE_W: return 3;
2762 default:
2763 unreachable("Not reached"); /* zero, one swizzles handled already */
2764 }
2765 }
2766
2767 void
2768 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2769 {
2770 int s = key->tex.swizzles[sampler];
2771
2772 this->result = src_reg(this, ir->type);
2773 dst_reg swizzled_result(this->result);
2774
2775 if (ir->op == ir_query_levels) {
2776 /* # levels is in .w */
2777 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2778 emit(MOV(swizzled_result, orig_val));
2779 return;
2780 }
2781
2782 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2783 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2784 emit(MOV(swizzled_result, orig_val));
2785 return;
2786 }
2787
2788
2789 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2790 int swizzle[4] = {0};
2791
2792 for (int i = 0; i < 4; i++) {
2793 switch (GET_SWZ(s, i)) {
2794 case SWIZZLE_ZERO:
2795 zero_mask |= (1 << i);
2796 break;
2797 case SWIZZLE_ONE:
2798 one_mask |= (1 << i);
2799 break;
2800 default:
2801 copy_mask |= (1 << i);
2802 swizzle[i] = GET_SWZ(s, i);
2803 break;
2804 }
2805 }
2806
2807 if (copy_mask) {
2808 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2809 swizzled_result.writemask = copy_mask;
2810 emit(MOV(swizzled_result, orig_val));
2811 }
2812
2813 if (zero_mask) {
2814 swizzled_result.writemask = zero_mask;
2815 emit(MOV(swizzled_result, src_reg(0.0f)));
2816 }
2817
2818 if (one_mask) {
2819 swizzled_result.writemask = one_mask;
2820 emit(MOV(swizzled_result, src_reg(1.0f)));
2821 }
2822 }
2823
2824 void
2825 vec4_visitor::visit(ir_return *)
2826 {
2827 unreachable("not reached");
2828 }
2829
2830 void
2831 vec4_visitor::visit(ir_discard *)
2832 {
2833 unreachable("not reached");
2834 }
2835
2836 void
2837 vec4_visitor::visit(ir_if *ir)
2838 {
2839 /* Don't point the annotation at the if statement, because then it plus
2840 * the then and else blocks get printed.
2841 */
2842 this->base_ir = ir->condition;
2843
2844 if (brw->gen == 6) {
2845 emit_if_gen6(ir);
2846 } else {
2847 enum brw_predicate predicate;
2848 emit_bool_to_cond_code(ir->condition, &predicate);
2849 emit(IF(predicate));
2850 }
2851
2852 visit_instructions(&ir->then_instructions);
2853
2854 if (!ir->else_instructions.is_empty()) {
2855 this->base_ir = ir->condition;
2856 emit(BRW_OPCODE_ELSE);
2857
2858 visit_instructions(&ir->else_instructions);
2859 }
2860
2861 this->base_ir = ir->condition;
2862 emit(BRW_OPCODE_ENDIF);
2863 }
2864
2865 void
2866 vec4_visitor::visit(ir_emit_vertex *)
2867 {
2868 unreachable("not reached");
2869 }
2870
2871 void
2872 vec4_visitor::visit(ir_end_primitive *)
2873 {
2874 unreachable("not reached");
2875 }
2876
2877 void
2878 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2879 dst_reg dst, src_reg offset,
2880 src_reg src0, src_reg src1)
2881 {
2882 unsigned mlen = 0;
2883
2884 /* Set the atomic operation offset. */
2885 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2886 mlen++;
2887
2888 /* Set the atomic operation arguments. */
2889 if (src0.file != BAD_FILE) {
2890 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2891 mlen++;
2892 }
2893
2894 if (src1.file != BAD_FILE) {
2895 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2896 mlen++;
2897 }
2898
2899 /* Emit the instruction. Note that this maps to the normal SIMD8
2900 * untyped atomic message on Ivy Bridge, but that's OK because
2901 * unused channels will be masked out.
2902 */
2903 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2904 src_reg(atomic_op), src_reg(surf_index));
2905 inst->base_mrf = 0;
2906 inst->mlen = mlen;
2907 }
2908
2909 void
2910 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2911 src_reg offset)
2912 {
2913 /* Set the surface read offset. */
2914 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2915
2916 /* Emit the instruction. Note that this maps to the normal SIMD8
2917 * untyped surface read message, but that's OK because unused
2918 * channels will be masked out.
2919 */
2920 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2921 dst, src_reg(surf_index));
2922 inst->base_mrf = 0;
2923 inst->mlen = 1;
2924 }
2925
2926 void
2927 vec4_visitor::emit_ndc_computation()
2928 {
2929 /* Get the position */
2930 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2931
2932 /* Build NDC coords, which are (x/w, y/w, z/w, 1/w) */
2933 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2934 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2935
2936 current_annotation = "NDC";
2937 dst_reg ndc_w = ndc;
2938 ndc_w.writemask = WRITEMASK_W;
2939 src_reg pos_w = pos;
2940 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2941 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2942
2943 dst_reg ndc_xyz = ndc;
2944 ndc_xyz.writemask = WRITEMASK_XYZ;
2945
2946 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2947 }
2948
2949 void
2950 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2951 {
2952 if (brw->gen < 6 &&
2953 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2954 key->userclip_active || brw->has_negative_rhw_bug)) {
2955 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2956 dst_reg header1_w = header1;
2957 header1_w.writemask = WRITEMASK_W;
2958
2959 emit(MOV(header1, 0u));
2960
2961 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2962 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2963
2964 current_annotation = "Point size";
2965 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2966 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2967 }
2968
2969 if (key->userclip_active) {
2970 current_annotation = "Clipping flags";
2971 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2972 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2973
2974 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2975 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2976 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2977
2978 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2979 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2980 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2981 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2982 }
2983
2984 /* i965 clipping workaround:
2985 * 1) Test for -ve rhw
2986 * 2) If set,
2987 * set ndc = (0,0,0,0)
2988 * set ucp[6] = 1
2989 *
2990 * Later, clipping will detect ucp[6] and ensure the primitive is
2991 * clipped against all fixed planes.
2992 */
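      /* The CMP below sets the flag register for channels where rhw is
       * negative; the predicated OR and MOV then only take effect in
       * those channels.
       */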
2993 if (brw->has_negative_rhw_bug) {
2994 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2995 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2996 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2997 vec4_instruction *inst;
2998 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2999 inst->predicate = BRW_PREDICATE_NORMAL;
3000 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3001 inst->predicate = BRW_PREDICATE_NORMAL;
3002 }
3003
3004 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3005 } else if (brw->gen < 6) {
3006 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3007 } else {
3008 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3009 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3010 dst_reg reg_w = reg;
3011 reg_w.writemask = WRITEMASK_W;
3012 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3013 }
3014 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3015 dst_reg reg_y = reg;
3016 reg_y.writemask = WRITEMASK_Y;
3017 reg_y.type = BRW_REGISTER_TYPE_D;
3018 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3019 }
3020 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3021 dst_reg reg_z = reg;
3022 reg_z.writemask = WRITEMASK_Z;
3023 reg_z.type = BRW_REGISTER_TYPE_D;
3024 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3025 }
3026 }
3027 }
3028
3029 void
3030 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3031 {
3032 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3033 *
3034 * "If a linked set of shaders forming the vertex stage contains no
3035 * static write to gl_ClipVertex or gl_ClipDistance, but the
3036 * application has requested clipping against user clip planes through
3037 * the API, then the coordinate written to gl_Position is used for
3038 * comparison against the user clip planes."
3039 *
3040 * This function is only called if the shader didn't write to
3041 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3042 * if the user wrote to it; otherwise we use gl_Position.
3043 */
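   /* Each enabled plane gets one channel of the output vec4:
    * gl_ClipDistance[i + offset] = dot(clip_vertex, userplane[i + offset]),
    * emitted below as a DP4 with a single-channel writemask.
    */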
3044 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3045 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3046 clip_vertex = VARYING_SLOT_POS;
3047 }
3048
3049 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3050 ++i) {
3051 reg.writemask = 1 << i;
3052 emit(DP4(reg,
3053 src_reg(output_reg[clip_vertex]),
3054 src_reg(this->userplane[i + offset])));
3055 }
3056 }
3057
3058 void
3059 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3060 {
3061 assert (varying < VARYING_SLOT_MAX);
3062 reg.type = output_reg[varying].type;
3063 current_annotation = output_reg_annotation[varying];
3064 /* Copy the register, saturating if necessary */
3065 vec4_instruction *inst = emit(MOV(reg,
3066 src_reg(output_reg[varying])));
3067 if ((varying == VARYING_SLOT_COL0 ||
3068 varying == VARYING_SLOT_COL1 ||
3069 varying == VARYING_SLOT_BFC0 ||
3070 varying == VARYING_SLOT_BFC1) &&
3071 key->clamp_vertex_color) {
3072 inst->saturate = true;
3073 }
3074 }
3075
3076 void
3077 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3078 {
3079 reg.type = BRW_REGISTER_TYPE_F;
3080
3081 switch (varying) {
3082 case VARYING_SLOT_PSIZ:
3083 {
3084 /* PSIZ is always in slot 0, and is coupled with other flags. */
3085 current_annotation = "indices, point width, clip flags";
3086 emit_psiz_and_flags(reg);
3087 break;
3088 }
3089 case BRW_VARYING_SLOT_NDC:
3090 current_annotation = "NDC";
3091 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3092 break;
3093 case VARYING_SLOT_POS:
3094 current_annotation = "gl_Position";
3095 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3096 break;
3097 case VARYING_SLOT_EDGE:
3098 /* This is present when doing unfilled polygons. We're supposed to copy
3099 * the edge flag from the user-provided vertex array
3100 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3101 * of that attribute (starts as 1.0f). This is then used in clipping to
3102 * determine which edges should be drawn as wireframe.
3103 */
3104 current_annotation = "edge flag";
3105 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3106 glsl_type::float_type, WRITEMASK_XYZW))));
3107 break;
3108 case BRW_VARYING_SLOT_PAD:
3109 /* No need to write to this slot */
3110 break;
3111 default:
3112 emit_generic_urb_slot(reg, varying);
3113 break;
3114 }
3115 }
3116
3117 static int
3118 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3119 {
3120 if (brw->gen >= 6) {
3121 /* URB data written (does not include the message header reg) must
3122 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3123 * section 5.4.3.2.2: URB_INTERLEAVED.
3124 *
3125 * URB entries are allocated on a multiple of 1024 bits, so an
3126 * extra 128 bits written here to make the end align to 256 is
3127 * no problem.
3128 */
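      /* mlen includes the one-register message header, so the data
       * portion is mlen - 1; bumping mlen up to an odd value keeps that
       * data length even.
       */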
3129 if ((mlen % 2) != 1)
3130 mlen++;
3131 }
3132
3133 return mlen;
3134 }
3135
3136
3137 /**
3138 * Generates the VUE payload plus the necessary URB write instructions to
3139 * output it.
3140 *
3141 * The VUE layout is documented in Volume 2a.
3142 */
3143 void
3144 vec4_visitor::emit_vertex()
3145 {
3146 /* MRF 0 is reserved for the debugger, so start with message header
3147 * in MRF 1.
3148 */
3149 int base_mrf = 1;
3150 int mrf = base_mrf;
3151 /* In the process of generating our URB write message contents, we
3152 * may need to unspill a register or load from an array. Those
3153 * reads would use MRFs 14-15.
3154 */
3155 int max_usable_mrf = 13;
3156
3157 /* The following assertion verifies that max_usable_mrf causes an
3158 * even-numbered amount of URB write data, which will meet gen6's
3159 * requirements for length alignment.
3160 */
3161 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3162
3163 /* First mrf is the g0-based message header containing URB handles and
3164 * such.
3165 */
3166 emit_urb_write_header(mrf++);
3167
3168 if (brw->gen < 6) {
3169 emit_ndc_computation();
3170 }
3171
3172 /* Lower legacy ff and ClipVertex clipping to clip distances */
3173 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3174 current_annotation = "user clip distances";
3175
3176 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3177 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3178
3179 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3180 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3181 }
3182
3183 /* We may need to split this up into several URB writes, so do them in a
3184 * loop.
3185 */
3186 int slot = 0;
3187 bool complete = false;
3188 do {
3189 /* URB offset is in URB row increments, and each of our MRFs is half of
3190 * one of those, since we're doing interleaved writes.
3191 */
3192 int offset = slot / 2;
3193
3194 mrf = base_mrf + 1;
3195 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3196 emit_urb_slot(dst_reg(MRF, mrf++),
3197 prog_data->vue_map.slot_to_varying[slot]);
3198
3199 /* If this was max_usable_mrf, we can't fit anything more into this
3200 * URB WRITE.
3201 */
3202 if (mrf > max_usable_mrf) {
3203 slot++;
3204 break;
3205 }
3206 }
3207
3208 complete = slot >= prog_data->vue_map.num_slots;
3209 current_annotation = "URB write";
3210 vec4_instruction *inst = emit_urb_write_opcode(complete);
3211 inst->base_mrf = base_mrf;
3212 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3213 inst->offset += offset;
3214 } while(!complete);
3215 }
3216
3217
3218 src_reg
3219 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3220 src_reg *reladdr, int reg_offset)
3221 {
3222 /* Because we store the values to scratch interleaved like our
3223 * vertex data, we need to scale the vec4 index by 2.
3224 */
3225 int message_header_scale = 2;
3226
3227 /* Pre-gen6, the message header uses byte offsets instead of vec4
3228 * (16-byte) offset units.
3229 */
3230 if (brw->gen < 6)
3231 message_header_scale *= 16;
3232
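   /* E.g. a constant reg_offset of 3 yields a message offset of 6 on
    * Gen6+, or 96 bytes on earlier generations.
    */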
3233 if (reladdr) {
3234 src_reg index = src_reg(this, glsl_type::int_type);
3235
3236 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3237 src_reg(reg_offset)));
3238 emit_before(block, inst, MUL(dst_reg(index), index,
3239 src_reg(message_header_scale)));
3240
3241 return index;
3242 } else {
3243 return src_reg(reg_offset * message_header_scale);
3244 }
3245 }
3246
3247 src_reg
3248 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3249 src_reg *reladdr, int reg_offset)
3250 {
3251 if (reladdr) {
3252 src_reg index = src_reg(this, glsl_type::int_type);
3253
3254 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3255 src_reg(reg_offset)));
3256
3257 /* Pre-gen6, the message header uses byte offsets instead of vec4
3258 * (16-byte) offset units.
3259 */
3260 if (brw->gen < 6) {
3261 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3262 }
3263
3264 return index;
3265 } else if (brw->gen >= 8) {
3266 /* Store the offset in a GRF so we can send-from-GRF. */
3267 src_reg offset = src_reg(this, glsl_type::int_type);
3268 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3269 return offset;
3270 } else {
3271 int message_header_scale = brw->gen < 6 ? 16 : 1;
3272 return src_reg(reg_offset * message_header_scale);
3273 }
3274 }
3275
3276 /**
3277 * Emits an instruction before @inst to load the value named by @orig_src
3278 * from scratch space at @base_offset to @temp.
3279 *
3280 * @base_offset is measured in 32-byte units (the size of a register).
3281 */
3282 void
3283 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3284 dst_reg temp, src_reg orig_src,
3285 int base_offset)
3286 {
3287 int reg_offset = base_offset + orig_src.reg_offset;
3288 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3289 reg_offset);
3290
3291 emit_before(block, inst, SCRATCH_READ(temp, index));
3292 }
3293
3294 /**
3295 * Emits an instruction after @inst to store the value to be written
3296 * to @orig_dst to scratch space at @base_offset, from @temp.
3297 *
3298 * @base_offset is measured in 32-byte units (the size of a register).
3299 */
3300 void
3301 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3302 int base_offset)
3303 {
3304 int reg_offset = base_offset + inst->dst.reg_offset;
3305 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3306 reg_offset);
3307
3308 /* Create a temporary register to store *inst's result in.
3309 *
3310 * We have to be careful in MOVing from our temporary result register in
3311 * the scratch write. If we swizzle from channels of the temporary that
3312 * weren't initialized, it will confuse live interval analysis, which will
3313 * make spilling fail to make progress.
3314 */
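   /* E.g. an instruction that only writes .y gets the swizzle .yyyy here,
    * so the scratch write never reads an uninitialized channel.
    */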
3315 src_reg temp = src_reg(this, glsl_type::vec4_type);
3316 temp.type = inst->dst.type;
3317 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3318 int swizzles[4];
3319 for (int i = 0; i < 4; i++)
3320 if (inst->dst.writemask & (1 << i))
3321 swizzles[i] = i;
3322 else
3323 swizzles[i] = first_writemask_chan;
3324 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3325 swizzles[2], swizzles[3]);
3326
3327 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3328 inst->dst.writemask));
3329 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3330 write->predicate = inst->predicate;
3331 write->ir = inst->ir;
3332 write->annotation = inst->annotation;
3333 inst->insert_after(block, write);
3334
3335 inst->dst.file = temp.file;
3336 inst->dst.reg = temp.reg;
3337 inst->dst.reg_offset = temp.reg_offset;
3338 inst->dst.reladdr = NULL;
3339 }
3340
3341 /**
3342 * We can't generally support array access in GRF space, because a
3343 * single instruction's destination can only span 2 contiguous
3344 * registers. So, we send all GRF arrays that get variable index
3345 * access to scratch space.
3346 */
3347 void
3348 vec4_visitor::move_grf_array_access_to_scratch()
3349 {
3350 int scratch_loc[this->virtual_grf_count];
3351 memset(scratch_loc, -1, sizeof(scratch_loc));
3352
3353 /* First, calculate the set of virtual GRFs that need to be punted
3354 * to scratch due to having any array access on them, and where in
3355 * scratch.
3356 */
3357 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3358 if (inst->dst.file == GRF && inst->dst.reladdr &&
3359 scratch_loc[inst->dst.reg] == -1) {
3360 scratch_loc[inst->dst.reg] = c->last_scratch;
3361 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3362 }
3363
3364 for (int i = 0 ; i < 3; i++) {
3365 src_reg *src = &inst->src[i];
3366
3367 if (src->file == GRF && src->reladdr &&
3368 scratch_loc[src->reg] == -1) {
3369 scratch_loc[src->reg] = c->last_scratch;
3370 c->last_scratch += this->virtual_grf_sizes[src->reg];
3371 }
3372 }
3373 }
3374
3375 /* Now, for anything that will be accessed through scratch, rewrite
3376 * it to load/store. Note that this is a _safe list walk, because
3377 * we may generate a new scratch_write instruction after the one
3378 * we're processing.
3379 */
3380 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3381 /* Set up the annotation tracking for new generated instructions. */
3382 base_ir = inst->ir;
3383 current_annotation = inst->annotation;
3384
3385 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3386 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3387 }
3388
3389 for (int i = 0 ; i < 3; i++) {
3390 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3391 continue;
3392
3393 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3394
3395 emit_scratch_read(block, inst, temp, inst->src[i],
3396 scratch_loc[inst->src[i].reg]);
3397
3398 inst->src[i].file = temp.file;
3399 inst->src[i].reg = temp.reg;
3400 inst->src[i].reg_offset = temp.reg_offset;
3401 inst->src[i].reladdr = NULL;
3402 }
3403 }
3404 }
3405
3406 /**
3407 * Emits an instruction before @inst to load the value named by @orig_src
3408 * from the pull constant buffer (surface) at @base_offset to @temp.
3409 */
3410 void
3411 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3412 dst_reg temp, src_reg orig_src,
3413 int base_offset)
3414 {
3415 int reg_offset = base_offset + orig_src.reg_offset;
3416 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3417 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3418 reg_offset);
3419 vec4_instruction *load;
3420
3421 if (brw->gen >= 7) {
3422 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3423 grf_offset.type = offset.type;
3424 emit_before(block, inst, MOV(grf_offset, offset));
3425
3426 load = new(mem_ctx) vec4_instruction(this,
3427 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3428 temp, index, src_reg(grf_offset));
3429 } else {
3430 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3431 temp, index, offset);
3432 load->base_mrf = 14;
3433 load->mlen = 1;
3434 }
3435 emit_before(block, inst, load);
3436 }
3437
3438 /**
3439 * Implements array access of uniforms by inserting a
3440 * PULL_CONSTANT_LOAD instruction.
3441 *
3442 * Unlike temporary GRF array access (where we don't support it due to
3443 * the difficulty of doing relative addressing on instruction
3444 * destinations), we could potentially do array access of uniforms
3445 * that were loaded in GRF space as push constants. In real-world
3446 * usage we've seen, though, the arrays being used are always larger
3447 * than we could load as push constants, so just always move all
3448 * uniform array access out to a pull constant buffer.
3449 */
3450 void
3451 vec4_visitor::move_uniform_array_access_to_pull_constants()
3452 {
3453 int pull_constant_loc[this->uniforms];
3454 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3455 bool nested_reladdr;
3456
3457 /* Walk through and find array access of uniforms. Put a copy of that
3458 * uniform in the pull constant buffer.
3459 *
3460 * Note that we don't move constant-indexed accesses to arrays. No
3461 * testing has been done of the performance impact of this choice.
3462 */
3463 do {
3464 nested_reladdr = false;
3465
3466 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3467 for (int i = 0 ; i < 3; i++) {
3468 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3469 continue;
3470
3471 int uniform = inst->src[i].reg;
3472
3473 if (inst->src[i].reladdr->reladdr)
3474 nested_reladdr = true; /* will need another pass */
3475
3476 /* If this array isn't already present in the pull constant buffer,
3477 * add it.
3478 */
3479 if (pull_constant_loc[uniform] == -1) {
3480 const gl_constant_value **values =
3481 &stage_prog_data->param[uniform * 4];
3482
3483 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3484
3485 assert(uniform < uniform_array_size);
3486 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3487 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3488 = values[j];
3489 }
3490 }
3491
3492 /* Set up the annotation tracking for new generated instructions. */
3493 base_ir = inst->ir;
3494 current_annotation = inst->annotation;
3495
3496 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3497
3498 emit_pull_constant_load(block, inst, temp, inst->src[i],
3499 pull_constant_loc[uniform]);
3500
3501 inst->src[i].file = temp.file;
3502 inst->src[i].reg = temp.reg;
3503 inst->src[i].reg_offset = temp.reg_offset;
3504 inst->src[i].reladdr = NULL;
3505 }
3506 }
3507 } while (nested_reladdr);
3508
3509 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3510 * no need to track them as larger-than-vec4 objects. This will be
3511 * relied on in cutting out unused uniform vectors from push
3512 * constants.
3513 */
3514 split_uniform_registers();
3515 }
3516
3517 void
3518 vec4_visitor::resolve_ud_negate(src_reg *reg)
3519 {
3520 if (reg->type != BRW_REGISTER_TYPE_UD ||
3521 !reg->negate)
3522 return;
3523
3524 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3525 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3526 *reg = temp;
3527 }
3528
3529 vec4_visitor::vec4_visitor(struct brw_context *brw,
3530 struct brw_vec4_compile *c,
3531 struct gl_program *prog,
3532 const struct brw_vec4_prog_key *key,
3533 struct brw_vec4_prog_data *prog_data,
3534 struct gl_shader_program *shader_prog,
3535 gl_shader_stage stage,
3536 void *mem_ctx,
3537 bool debug_flag,
3538 bool no_spills,
3539 shader_time_shader_type st_base,
3540 shader_time_shader_type st_written,
3541 shader_time_shader_type st_reset)
3542 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3543 c(c),
3544 key(key),
3545 prog_data(prog_data),
3546 sanity_param_count(0),
3547 fail_msg(NULL),
3548 first_non_payload_grf(0),
3549 need_all_constants_in_pull_buffer(false),
3550 debug_flag(debug_flag),
3551 no_spills(no_spills),
3552 st_base(st_base),
3553 st_written(st_written),
3554 st_reset(st_reset)
3555 {
3556 this->mem_ctx = mem_ctx;
3557 this->failed = false;
3558
3559 this->base_ir = NULL;
3560 this->current_annotation = NULL;
3561 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3562
3563 this->variable_ht = hash_table_ctor(0,
3564 hash_table_pointer_hash,
3565 hash_table_pointer_compare);
3566
3567 this->virtual_grf_start = NULL;
3568 this->virtual_grf_end = NULL;
3569 this->virtual_grf_sizes = NULL;
3570 this->virtual_grf_count = 0;
3571 this->virtual_grf_reg_map = NULL;
3572 this->virtual_grf_reg_count = 0;
3573 this->virtual_grf_array_size = 0;
3574 this->live_intervals_valid = false;
3575
3576 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3577
3578 this->uniforms = 0;
3579
3580 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3581 * at least one. See setup_uniforms() in brw_vec4.cpp.
3582 */
3583 this->uniform_array_size = 1;
3584 if (prog_data) {
3585 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3586 }
3587
3588 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3589 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3590 }
3591
3592 vec4_visitor::~vec4_visitor()
3593 {
3594 hash_table_dtor(this->variable_ht);
3595 }
3596
3597
3598 void
3599 vec4_visitor::fail(const char *format, ...)
3600 {
3601 va_list va;
3602 char *msg;
3603
3604 if (failed)
3605 return;
3606
3607 failed = true;
3608
3609 va_start(va, format);
3610 msg = ralloc_vasprintf(mem_ctx, format, va);
3611 va_end(va);
3612 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3613
3614 this->fail_msg = msg;
3615
3616 if (debug_flag) {
3617 fprintf(stderr, "%s", msg);
3618 }
3619 }
3620
3621 } /* namespace brw */