src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 #define FIRST_SPILL_MRF(gen) (gen == 6 ? 21 : 13)
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->predicate = BRW_PREDICATE_NONE;
49 this->predicate_inverse = false;
50 this->target = 0;
51 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
52 this->shadow_compare = false;
53 this->ir = NULL;
54 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
55 this->header_size = 0;
56 this->flag_subreg = 0;
57 this->mlen = 0;
58 this->base_mrf = 0;
59 this->offset = 0;
60 this->annotation = NULL;
61 }
62
63 vec4_instruction *
64 vec4_visitor::emit(vec4_instruction *inst)
65 {
66 inst->ir = this->base_ir;
67 inst->annotation = this->current_annotation;
68
69 this->instructions.push_tail(inst);
70
71 return inst;
72 }
73
74 vec4_instruction *
75 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
76 vec4_instruction *new_inst)
77 {
78 new_inst->ir = inst->ir;
79 new_inst->annotation = inst->annotation;
80
81 inst->insert_before(block, new_inst);
82
83 return inst;
84 }
85
86 vec4_instruction *
87 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
88 const src_reg &src1, const src_reg &src2)
89 {
90 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
91 }
92
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
96 const src_reg &src1)
97 {
98 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
103 {
104 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
109 {
110 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
111 }
112
113 vec4_instruction *
114 vec4_visitor::emit(enum opcode opcode)
115 {
116 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
117 }
118
119 #define ALU1(op) \
120 vec4_instruction * \
121 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
122 { \
123 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
124 }
125
126 #define ALU2(op) \
127 vec4_instruction * \
128 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
129 const src_reg &src1) \
130 { \
131 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
132 src0, src1); \
133 }
134
135 #define ALU2_ACC(op) \
136 vec4_instruction * \
137 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
138 const src_reg &src1) \
139 { \
140 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
141 BRW_OPCODE_##op, dst, src0, src1); \
142 inst->writes_accumulator = true; \
143 return inst; \
144 }
145
146 #define ALU3(op) \
147 vec4_instruction * \
148 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
149 const src_reg &src1, const src_reg &src2) \
150 { \
151 assert(devinfo->gen >= 6); \
152 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
153 src0, src1, src2); \
154 }
155
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU1(F32TO16)
163 ALU1(F16TO32)
164 ALU2(ADD)
165 ALU2(MUL)
166 ALU2_ACC(MACH)
167 ALU2(AND)
168 ALU2(OR)
169 ALU2(XOR)
170 ALU2(DP3)
171 ALU2(DP4)
172 ALU2(DPH)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(MAC)
188
189 /** Gen4 predicated IF. */
190 vec4_instruction *
191 vec4_visitor::IF(enum brw_predicate predicate)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
196 inst->predicate = predicate;
197
198 return inst;
199 }
200
201 /** Gen6 IF with embedded comparison. */
202 vec4_instruction *
203 vec4_visitor::IF(src_reg src0, src_reg src1,
204 enum brw_conditional_mod condition)
205 {
206 assert(devinfo->gen == 6);
207
208 vec4_instruction *inst;
209
210 resolve_ud_negate(&src0);
211 resolve_ud_negate(&src1);
212
213 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
214 src0, src1);
215 inst->conditional_mod = condition;
216
217 return inst;
218 }
219
220 /**
221 * CMP: Sets the low bit of the destination channels with the result
222 * of the comparison, while the upper bits are undefined, and updates
223 * the flag register with the packed 16 bits of the result.
224 */
225 vec4_instruction *
226 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
227 enum brw_conditional_mod condition)
228 {
229 vec4_instruction *inst;
230
231 /* Take the instruction:
232 *
233 * CMP null<d> src0<f> src1<f>
234 *
235 * Original gen4 does type conversion to the destination type before
236 * comparison, producing garbage results for floating point comparisons.
237 *
238 * The destination type doesn't matter on newer generations, so we set the
239 * type to match src0 so we can compact the instruction.
240 */
241 dst.type = src0.type;
242 if (dst.file == HW_REG)
243 dst.fixed_hw_reg.type = dst.type;
244
245 resolve_ud_negate(&src0);
246 resolve_ud_negate(&src1);
247
248 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
249 inst->conditional_mod = condition;
250
251 return inst;
252 }
253
254 vec4_instruction *
255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
256 {
257 vec4_instruction *inst;
258
259 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
260 dst, index);
261 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
262 inst->mlen = 2;
263
264 return inst;
265 }
266
267 vec4_instruction *
268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
269 const src_reg &index)
270 {
271 vec4_instruction *inst;
272
273 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
274 dst, src, index);
275 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
276 inst->mlen = 3;
277
278 return inst;
279 }
280
281 void
282 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
283 {
284 static enum opcode dot_opcodes[] = {
285 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
286 };
287
288 emit(dot_opcodes[elements - 2], dst, src0, src1);
289 }
290
291 src_reg
292 vec4_visitor::fix_3src_operand(const src_reg &src)
293 {
294 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
295 * able to use vertical stride of zero to replicate the vec4 uniform, like
296 *
297 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
298 *
299 * But you can't, since vertical stride is always four in three-source
300 * instructions. Instead, insert a MOV instruction to do the replication so
301 * that the three-source instruction can consume it.
302 */
303
304 /* The MOV is only needed if the source is a uniform or immediate. */
305 if (src.file != UNIFORM && src.file != IMM)
306 return src;
307
308 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
309 return src;
310
311 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
312 expanded.type = src.type;
313 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
314 return src_reg(expanded);
315 }
316
317 src_reg
318 vec4_visitor::resolve_source_modifiers(const src_reg &src)
319 {
320 if (!src.abs && !src.negate)
321 return src;
322
323 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
324 resolved.type = src.type;
325 emit(MOV(resolved, src));
326
327 return src_reg(resolved);
328 }
329
330 src_reg
331 vec4_visitor::fix_math_operand(const src_reg &src)
332 {
333 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
334 return src;
335
336 /* The gen6 math instruction ignores the source modifiers --
337 * swizzle, abs, negate, and at least some parts of the register
338 * region description.
339 *
340 * Rather than trying to enumerate all these cases, *always* expand the
341 * operand to a temp GRF for gen6.
342 *
343 * For gen7, keep the operand as-is, except if immediate, which gen7 still
344 * can't use.
345 */
346
347 if (devinfo->gen == 7 && src.file != IMM)
348 return src;
349
350 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
351 expanded.type = src.type;
352 emit(MOV(expanded, src));
353 return src_reg(expanded);
354 }
355
356 vec4_instruction *
357 vec4_visitor::emit_math(enum opcode opcode,
358 const dst_reg &dst,
359 const src_reg &src0, const src_reg &src1)
360 {
361 vec4_instruction *math =
362 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
363
364 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
365 /* MATH on Gen6 must be align1, so we can't do writemasks. */
366 math->dst = dst_reg(this, glsl_type::vec4_type);
367 math->dst.type = dst.type;
368 math = emit(MOV(dst, src_reg(math->dst)));
369 } else if (devinfo->gen < 6) {
370 math->base_mrf = 1;
371 math->mlen = src1.file == BAD_FILE ? 1 : 2;
372 }
373
374 return math;
375 }
376
377 void
378 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
379 {
380 if (devinfo->gen < 7) {
381 unreachable("ir_unop_pack_half_2x16 should be lowered");
382 }
383
384 assert(dst.type == BRW_REGISTER_TYPE_UD);
385 assert(src0.type == BRW_REGISTER_TYPE_F);
386
387 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
388 *
389 * Because this instruction does not have a 16-bit floating-point type,
390 * the destination data type must be Word (W).
391 *
392 * The destination must be DWord-aligned and specify a horizontal stride
393 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
394 * each destination channel and the upper word is not modified.
395 *
396 * The above restriction implies that the f32to16 instruction must use
397 * align1 mode, because only in align1 mode is it possible to specify
398 * horizontal stride. We choose here to defy the hardware docs and emit
399 * align16 instructions.
400 *
401 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
402 * instructions. I was partially successful in that the code passed all
403 * tests. However, the code was dubiously correct and fragile, and the
404 * tests were not harsh enough to probe that frailty. Not trusting the
405 * code, I chose instead to remain in align16 mode in defiance of the hw
406 * docs).
407 *
408 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
409 * simulator, emitting a f32to16 in align16 mode with UD as destination
410 * data type is safe. The behavior differs from that specified in the PRM
411 * in that the upper word of each destination channel is cleared to 0.
412 */
413
414 dst_reg tmp_dst(this, glsl_type::uvec2_type);
415 src_reg tmp_src(tmp_dst);
416
417 #if 0
418 /* Verify the undocumented behavior on which the following instructions
419 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
420 * then the result of the bit-or instruction below will be incorrect.
421 *
422 * You should inspect the disasm output in order to verify that the MOV is
423 * not optimized away.
424 */
425 emit(MOV(tmp_dst, src_reg(0x12345678u)));
426 #endif
427
428 /* Give tmp the form below, where "." means untouched.
429 *
430 * w z y x w z y x
431 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
432 *
433 * That the upper word of each write-channel be 0 is required for the
434 * following bit-shift and bit-or instructions to work. Note that this
435 * relies on the undocumented hardware behavior mentioned above.
436 */
437 tmp_dst.writemask = WRITEMASK_XY;
438 emit(F32TO16(tmp_dst, src0));
439
440 /* Give the write-channels of dst the form:
441 * 0xhhhh0000
442 */
443 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
444 emit(SHL(dst, tmp_src, src_reg(16u)));
445
446 /* Finally, give the write-channels of dst the form of packHalf2x16's
447 * output:
448 * 0xhhhhllll
449 */
450 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
451 emit(OR(dst, src_reg(dst), tmp_src));
452 }
453
454 void
455 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
456 {
457 if (devinfo->gen < 7) {
458 unreachable("ir_unop_unpack_half_2x16 should be lowered");
459 }
460
461 assert(dst.type == BRW_REGISTER_TYPE_F);
462 assert(src0.type == BRW_REGISTER_TYPE_UD);
463
464 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
465 *
466 * Because this instruction does not have a 16-bit floating-point type,
467 * the source data type must be Word (W). The destination type must be
468 * F (Float).
469 *
470 * To use W as the source data type, we must adjust horizontal strides,
471 * which is only possible in align1 mode. All my [chadv] attempts at
472 * emitting align1 instructions for unpackHalf2x16 failed to pass the
473 * Piglit tests, so I gave up.
474 *
475 * I've verified that, on gen7 hardware and the simulator, it is safe to
476 * emit f16to32 in align16 mode with UD as source data type.
477 */
478
479 dst_reg tmp_dst(this, glsl_type::uvec2_type);
480 src_reg tmp_src(tmp_dst);
481
482 tmp_dst.writemask = WRITEMASK_X;
483 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
484
485 tmp_dst.writemask = WRITEMASK_Y;
486 emit(SHR(tmp_dst, src0, src_reg(16u)));
487
488 dst.writemask = WRITEMASK_XY;
489 emit(F16TO32(dst, tmp_src));
490 }
491
492 void
493 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
494 {
495 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
496 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
497 * is not suitable to generate the shift values, but we can use the packed
498 * vector float and a type-converting MOV.
499 */
500 dst_reg shift(this, glsl_type::uvec4_type);
501 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
502
503 dst_reg shifted(this, glsl_type::uvec4_type);
504 src0.swizzle = BRW_SWIZZLE_XXXX;
505 emit(SHR(shifted, src0, src_reg(shift)));
506
507 shifted.type = BRW_REGISTER_TYPE_UB;
508 dst_reg f(this, glsl_type::vec4_type);
509 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
510
511 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
512 }
513
514 void
515 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
516 {
517 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
518 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
519 * is not suitable to generate the shift values, but we can use the packed
520 * vector float and a type-converting MOV.
521 */
522 dst_reg shift(this, glsl_type::uvec4_type);
523 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
524
525 dst_reg shifted(this, glsl_type::uvec4_type);
526 src0.swizzle = BRW_SWIZZLE_XXXX;
527 emit(SHR(shifted, src0, src_reg(shift)));
528
529 shifted.type = BRW_REGISTER_TYPE_B;
530 dst_reg f(this, glsl_type::vec4_type);
531 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
532
533 dst_reg scaled(this, glsl_type::vec4_type);
534 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
535
536 dst_reg max(this, glsl_type::vec4_type);
537 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
538 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
539 }
540
541 void
542 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
543 {
544 dst_reg saturated(this, glsl_type::vec4_type);
545 vec4_instruction *inst = emit(MOV(saturated, src0));
546 inst->saturate = true;
547
548 dst_reg scaled(this, glsl_type::vec4_type);
549 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
550
551 dst_reg rounded(this, glsl_type::vec4_type);
552 emit(RNDE(rounded, src_reg(scaled)));
553
554 dst_reg u(this, glsl_type::uvec4_type);
555 emit(MOV(u, src_reg(rounded)));
556
557 src_reg bytes(u);
558 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
559 }
560
561 void
562 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
563 {
564 dst_reg max(this, glsl_type::vec4_type);
565 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
566
567 dst_reg min(this, glsl_type::vec4_type);
568 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
569
570 dst_reg scaled(this, glsl_type::vec4_type);
571 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
572
573 dst_reg rounded(this, glsl_type::vec4_type);
574 emit(RNDE(rounded, src_reg(scaled)));
575
576 dst_reg i(this, glsl_type::ivec4_type);
577 emit(MOV(i, src_reg(rounded)));
578
579 src_reg bytes(i);
580 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
581 }
582
583 void
584 vec4_visitor::visit_instructions(const exec_list *list)
585 {
586 foreach_in_list(ir_instruction, ir, list) {
587 base_ir = ir;
588 ir->accept(this);
589 }
590 }
591
592 /**
593 * Returns the minimum number of vec4 elements needed to pack a type.
594 *
595 * For simple types, it will return 1 (a single vec4); for matrices, the
596 * number of columns; for array and struct, the sum of the vec4_size of
597 * each of its elements; and for sampler and atomic, zero.
598 *
599 * This method is useful to calculate how much register space is needed to
600 * store a particular type.
601 */
602 extern "C" int
603 type_size_vec4(const struct glsl_type *type)
604 {
605 unsigned int i;
606 int size;
607
608 switch (type->base_type) {
609 case GLSL_TYPE_UINT:
610 case GLSL_TYPE_INT:
611 case GLSL_TYPE_FLOAT:
612 case GLSL_TYPE_BOOL:
613 if (type->is_matrix()) {
614 return type->matrix_columns;
615 } else {
616 /* Regardless of size of vector, it gets a vec4. This is bad
617 * packing for things like floats, but otherwise arrays become a
618 * mess. Hopefully a later pass over the code can pack scalars
619 * down if appropriate.
620 */
621 return 1;
622 }
623 case GLSL_TYPE_ARRAY:
624 assert(type->length > 0);
625 return type_size_vec4(type->fields.array) * type->length;
626 case GLSL_TYPE_STRUCT:
627 size = 0;
628 for (i = 0; i < type->length; i++) {
629 size += type_size_vec4(type->fields.structure[i].type);
630 }
631 return size;
632 case GLSL_TYPE_SUBROUTINE:
633 return 1;
634
635 case GLSL_TYPE_SAMPLER:
636 /* Samplers take up no register space, since they're baked in at
637 * link time.
638 */
639 return 0;
640 case GLSL_TYPE_ATOMIC_UINT:
641 return 0;
642 case GLSL_TYPE_IMAGE:
643 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
644 case GLSL_TYPE_VOID:
645 case GLSL_TYPE_DOUBLE:
646 case GLSL_TYPE_ERROR:
647 case GLSL_TYPE_INTERFACE:
648 case GLSL_TYPE_FUNCTION:
649 unreachable("not reached");
650 }
651
652 return 0;
653 }
654
655 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
656 {
657 init();
658
659 this->file = GRF;
660 this->reg = v->alloc.allocate(type_size_vec4(type));
661
662 if (type->is_array() || type->is_record()) {
663 this->swizzle = BRW_SWIZZLE_NOOP;
664 } else {
665 this->swizzle = brw_swizzle_for_size(type->vector_elements);
666 }
667
668 this->type = brw_type_for_base_type(type);
669 }
670
671 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
672 {
673 assert(size > 0);
674
675 init();
676
677 this->file = GRF;
678 this->reg = v->alloc.allocate(type_size_vec4(type) * size);
679
680 this->swizzle = BRW_SWIZZLE_NOOP;
681
682 this->type = brw_type_for_base_type(type);
683 }
684
685 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
686 {
687 init();
688
689 this->file = GRF;
690 this->reg = v->alloc.allocate(type_size_vec4(type));
691
692 if (type->is_array() || type->is_record()) {
693 this->writemask = WRITEMASK_XYZW;
694 } else {
695 this->writemask = (1 << type->vector_elements) - 1;
696 }
697
698 this->type = brw_type_for_base_type(type);
699 }
700
701 void
702 vec4_visitor::setup_vec4_uniform_value(unsigned param_offset,
703 const gl_constant_value *values,
704 unsigned n)
705 {
706 static const gl_constant_value zero = { 0 };
707
708 assert(param_offset % 4 == 0);
709
710 for (unsigned i = 0; i < n; ++i)
711 stage_prog_data->param[param_offset + i] = &values[i];
712
713 for (unsigned i = n; i < 4; ++i)
714 stage_prog_data->param[param_offset + i] = &zero;
715
716 uniform_vector_size[param_offset / 4] = n;
717 }
718
719 /* Our support for uniforms is piggy-backed on the struct
720 * gl_fragment_program, because that's where the values actually
721 * get stored, rather than in some global gl_shader_program uniform
722 * store.
723 */
724 void
725 vec4_visitor::setup_uniform_values(ir_variable *ir)
726 {
727 int namelen = strlen(ir->name);
728
729 /* The data for our (non-builtin) uniforms is stored in a series of
730 * gl_uniform_driver_storage structs for each subcomponent that
731 * glGetUniformLocation() could name. We know it's been set up in the same
732 * order we'd walk the type, so walk the list of storage and find anything
733 * with our name, or the prefix of a component that starts with our name.
734 */
735 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
736 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
737
738 if (storage->builtin)
739 continue;
740
741 if (strncmp(ir->name, storage->name, namelen) != 0 ||
742 (storage->name[namelen] != 0 &&
743 storage->name[namelen] != '.' &&
744 storage->name[namelen] != '[')) {
745 continue;
746 }
747
748 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
749 storage->type->matrix_columns);
750 const unsigned vector_size = storage->type->vector_elements;
751
752 for (unsigned s = 0; s < vector_count; s++) {
753 setup_vec4_uniform_value(uniforms * 4,
754 &storage->storage[s * vector_size],
755 vector_size);
756 uniforms++;
757 }
758 }
759 }
760
761 /* Our support for builtin uniforms is even scarier than non-builtin.
762 * It sits on top of the PROG_STATE_VAR parameters that are
763 * automatically updated from GL context state.
764 */
765 void
766 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
767 {
768 const ir_state_slot *const slots = ir->get_state_slots();
769 assert(slots != NULL);
770
771 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
772 /* This state reference has already been setup by ir_to_mesa,
773 * but we'll get the same index back here. We can reference
774 * ParameterValues directly, since unlike brw_fs.cpp, we never
775 * add new state references during compile.
776 */
777 int index = _mesa_add_state_reference(this->prog->Parameters,
778 (gl_state_index *)slots[i].tokens);
779 gl_constant_value *values =
780 &this->prog->Parameters->ParameterValues[index][0];
781
782 assert(this->uniforms < uniform_array_size);
783
784 for (unsigned j = 0; j < 4; j++)
785 stage_prog_data->param[this->uniforms * 4 + j] =
786 &values[GET_SWZ(slots[i].swizzle, j)];
787
788 this->uniform_vector_size[this->uniforms] =
789 (ir->type->is_scalar() || ir->type->is_vector() ||
790 ir->type->is_matrix() ? ir->type->vector_elements : 4);
791
792 this->uniforms++;
793 }
794 }
795
796 dst_reg *
797 vec4_visitor::variable_storage(ir_variable *var)
798 {
799 return (dst_reg *)hash_table_find(this->variable_ht, var);
800 }
801
802 void
803 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
804 enum brw_predicate *predicate)
805 {
806 ir_expression *expr = ir->as_expression();
807
808 *predicate = BRW_PREDICATE_NORMAL;
809
810 if (expr && expr->operation != ir_binop_ubo_load) {
811 src_reg op[3];
812 vec4_instruction *inst;
813
814 assert(expr->get_num_operands() <= 3);
815 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
816 expr->operands[i]->accept(this);
817 op[i] = this->result;
818
819 resolve_ud_negate(&op[i]);
820 }
821
822 switch (expr->operation) {
823 case ir_unop_logic_not:
824 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
825 inst->conditional_mod = BRW_CONDITIONAL_Z;
826 break;
827
828 case ir_binop_logic_xor:
829 if (devinfo->gen <= 5) {
830 src_reg temp = src_reg(this, ir->type);
831 emit(XOR(dst_reg(temp), op[0], op[1]));
832 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
833 } else {
834 inst = emit(XOR(dst_null_d(), op[0], op[1]));
835 }
836 inst->conditional_mod = BRW_CONDITIONAL_NZ;
837 break;
838
839 case ir_binop_logic_or:
840 if (devinfo->gen <= 5) {
841 src_reg temp = src_reg(this, ir->type);
842 emit(OR(dst_reg(temp), op[0], op[1]));
843 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
844 } else {
845 inst = emit(OR(dst_null_d(), op[0], op[1]));
846 }
847 inst->conditional_mod = BRW_CONDITIONAL_NZ;
848 break;
849
850 case ir_binop_logic_and:
851 if (devinfo->gen <= 5) {
852 src_reg temp = src_reg(this, ir->type);
853 emit(AND(dst_reg(temp), op[0], op[1]));
854 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
855 } else {
856 inst = emit(AND(dst_null_d(), op[0], op[1]));
857 }
858 inst->conditional_mod = BRW_CONDITIONAL_NZ;
859 break;
860
861 case ir_unop_f2b:
862 if (devinfo->gen >= 6) {
863 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
864 } else {
865 inst = emit(MOV(dst_null_f(), op[0]));
866 inst->conditional_mod = BRW_CONDITIONAL_NZ;
867 }
868 break;
869
870 case ir_unop_i2b:
871 if (devinfo->gen >= 6) {
872 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
873 } else {
874 inst = emit(MOV(dst_null_d(), op[0]));
875 inst->conditional_mod = BRW_CONDITIONAL_NZ;
876 }
877 break;
878
879 case ir_binop_all_equal:
880 if (devinfo->gen <= 5) {
881 resolve_bool_comparison(expr->operands[0], &op[0]);
882 resolve_bool_comparison(expr->operands[1], &op[1]);
883 }
884 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
885 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
886 break;
887
888 case ir_binop_any_nequal:
889 if (devinfo->gen <= 5) {
890 resolve_bool_comparison(expr->operands[0], &op[0]);
891 resolve_bool_comparison(expr->operands[1], &op[1]);
892 }
893 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
894 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
895 break;
896
897 case ir_unop_any:
898 if (devinfo->gen <= 5) {
899 resolve_bool_comparison(expr->operands[0], &op[0]);
900 }
901 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
902 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
903 break;
904
905 case ir_binop_greater:
906 case ir_binop_gequal:
907 case ir_binop_less:
908 case ir_binop_lequal:
909 case ir_binop_equal:
910 case ir_binop_nequal:
911 if (devinfo->gen <= 5) {
912 resolve_bool_comparison(expr->operands[0], &op[0]);
913 resolve_bool_comparison(expr->operands[1], &op[1]);
914 }
915 emit(CMP(dst_null_d(), op[0], op[1],
916 brw_conditional_for_comparison(expr->operation)));
917 break;
918
919 case ir_triop_csel: {
920 /* Expand the boolean condition into the flag register. */
921 inst = emit(MOV(dst_null_d(), op[0]));
922 inst->conditional_mod = BRW_CONDITIONAL_NZ;
923
924 /* Select which boolean to return. */
925 dst_reg temp(this, expr->operands[1]->type);
926 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
927 inst->predicate = BRW_PREDICATE_NORMAL;
928
929 /* Expand the result to a condition code. */
930 inst = emit(MOV(dst_null_d(), src_reg(temp)));
931 inst->conditional_mod = BRW_CONDITIONAL_NZ;
932 break;
933 }
934
935 default:
936 unreachable("not reached");
937 }
938 return;
939 }
940
941 ir->accept(this);
942
943 resolve_ud_negate(&this->result);
944
945 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
946 inst->conditional_mod = BRW_CONDITIONAL_NZ;
947 }
948
949 /**
950 * Emit a gen6 IF statement with the comparison folded into the IF
951 * instruction.
952 */
953 void
954 vec4_visitor::emit_if_gen6(ir_if *ir)
955 {
956 ir_expression *expr = ir->condition->as_expression();
957
958 if (expr && expr->operation != ir_binop_ubo_load) {
959 src_reg op[3];
960 dst_reg temp;
961
962 assert(expr->get_num_operands() <= 3);
963 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
964 expr->operands[i]->accept(this);
965 op[i] = this->result;
966 }
967
968 switch (expr->operation) {
969 case ir_unop_logic_not:
970 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
971 return;
972
973 case ir_binop_logic_xor:
974 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
975 return;
976
977 case ir_binop_logic_or:
978 temp = dst_reg(this, glsl_type::bool_type);
979 emit(OR(temp, op[0], op[1]));
980 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
981 return;
982
983 case ir_binop_logic_and:
984 temp = dst_reg(this, glsl_type::bool_type);
985 emit(AND(temp, op[0], op[1]));
986 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
987 return;
988
989 case ir_unop_f2b:
990 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
991 return;
992
993 case ir_unop_i2b:
994 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
995 return;
996
997 case ir_binop_greater:
998 case ir_binop_gequal:
999 case ir_binop_less:
1000 case ir_binop_lequal:
1001 case ir_binop_equal:
1002 case ir_binop_nequal:
1003 emit(IF(op[0], op[1],
1004 brw_conditional_for_comparison(expr->operation)));
1005 return;
1006
1007 case ir_binop_all_equal:
1008 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1009 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1010 return;
1011
1012 case ir_binop_any_nequal:
1013 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1014 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1015 return;
1016
1017 case ir_unop_any:
1018 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1019 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1020 return;
1021
1022 case ir_triop_csel: {
1023 /* Expand the boolean condition into the flag register. */
1024 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1025 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1026
1027 /* Select which boolean to return. */
1028 dst_reg temp(this, expr->operands[1]->type);
1029 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1030 inst->predicate = BRW_PREDICATE_NORMAL;
1031
1032 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1033 return;
1034 }
1035
1036 default:
1037 unreachable("not reached");
1038 }
1039 return;
1040 }
1041
1042 ir->condition->accept(this);
1043
1044 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1045 }
1046
1047 void
1048 vec4_visitor::visit(ir_variable *ir)
1049 {
1050 dst_reg *reg = NULL;
1051
1052 if (variable_storage(ir))
1053 return;
1054
1055 switch (ir->data.mode) {
1056 case ir_var_shader_in:
1057 assert(ir->data.location != -1);
1058 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1059 break;
1060
1061 case ir_var_shader_out:
1062 assert(ir->data.location != -1);
1063 reg = new(mem_ctx) dst_reg(this, ir->type);
1064
1065 for (int i = 0; i < type_size_vec4(ir->type); i++) {
1066 output_reg[ir->data.location + i] = *reg;
1067 output_reg[ir->data.location + i].reg_offset = i;
1068 output_reg_annotation[ir->data.location + i] = ir->name;
1069 }
1070 break;
1071
1072 case ir_var_auto:
1073 case ir_var_temporary:
1074 reg = new(mem_ctx) dst_reg(this, ir->type);
1075 break;
1076
1077 case ir_var_uniform:
1078 case ir_var_shader_storage:
1079 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1080
1081 /* Thanks to the lower_ubo_reference pass, we will see only
1082 * ir_binop_{ubo,ssbo}_load expressions and not ir_dereference_variable
1083 * for UBO/SSBO variables, so no need for them to be in variable_ht.
1084 *
1085 * Some uniforms, such as samplers and atomic counters, have no actual
1086 * storage, so we should ignore them.
1087 */
1088 if (ir->is_in_buffer_block() || type_size_vec4(ir->type) == 0)
1089 return;
1090
1091 /* Track how big the whole uniform variable is, in case we need to put a
1092 * copy of its data into pull constants for array access.
1093 */
1094 assert(this->uniforms < uniform_array_size);
1095 this->uniform_size[this->uniforms] = type_size_vec4(ir->type);
1096
1097 if (!strncmp(ir->name, "gl_", 3)) {
1098 setup_builtin_uniform_values(ir);
1099 } else {
1100 setup_uniform_values(ir);
1101 }
1102 break;
1103
1104 case ir_var_system_value:
1105 reg = make_reg_for_system_value(ir->data.location, ir->type);
1106 break;
1107
1108 default:
1109 unreachable("not reached");
1110 }
1111
1112 reg->type = brw_type_for_base_type(ir->type);
1113 hash_table_insert(this->variable_ht, reg, ir);
1114 }
1115
1116 void
1117 vec4_visitor::visit(ir_loop *ir)
1118 {
1119 /* We don't want debugging output to print the whole body of the
1120 * loop as the annotation.
1121 */
1122 this->base_ir = NULL;
1123
1124 emit(BRW_OPCODE_DO);
1125
1126 visit_instructions(&ir->body_instructions);
1127
1128 emit(BRW_OPCODE_WHILE);
1129 }
1130
1131 void
1132 vec4_visitor::visit(ir_loop_jump *ir)
1133 {
1134 switch (ir->mode) {
1135 case ir_loop_jump::jump_break:
1136 emit(BRW_OPCODE_BREAK);
1137 break;
1138 case ir_loop_jump::jump_continue:
1139 emit(BRW_OPCODE_CONTINUE);
1140 break;
1141 }
1142 }
1143
1144
1145 void
1146 vec4_visitor::visit(ir_function_signature *)
1147 {
1148 unreachable("not reached");
1149 }
1150
1151 void
1152 vec4_visitor::visit(ir_function *ir)
1153 {
1154 /* Ignore function bodies other than main() -- we shouldn't see calls to
1155 * them since they should all be inlined.
1156 */
1157 if (strcmp(ir->name, "main") == 0) {
1158 const ir_function_signature *sig;
1159 exec_list empty;
1160
1161 sig = ir->matching_signature(NULL, &empty, false);
1162
1163 assert(sig);
1164
1165 visit_instructions(&sig->body);
1166 }
1167 }
1168
1169 bool
1170 vec4_visitor::try_emit_mad(ir_expression *ir)
1171 {
1172 /* 3-src instructions were introduced in gen6. */
1173 if (devinfo->gen < 6)
1174 return false;
1175
1176 /* MAD can only handle floating-point data. */
1177 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1178 return false;
1179
1180 ir_rvalue *nonmul;
1181 ir_expression *mul;
1182 bool mul_negate, mul_abs;
1183
1184 for (int i = 0; i < 2; i++) {
1185 mul_negate = false;
1186 mul_abs = false;
1187
1188 mul = ir->operands[i]->as_expression();
1189 nonmul = ir->operands[1 - i];
1190
1191 if (mul && mul->operation == ir_unop_abs) {
1192 mul = mul->operands[0]->as_expression();
1193 mul_abs = true;
1194 } else if (mul && mul->operation == ir_unop_neg) {
1195 mul = mul->operands[0]->as_expression();
1196 mul_negate = true;
1197 }
1198
1199 if (mul && mul->operation == ir_binop_mul)
1200 break;
1201 }
1202
1203 if (!mul || mul->operation != ir_binop_mul)
1204 return false;
1205
1206 nonmul->accept(this);
1207 src_reg src0 = fix_3src_operand(this->result);
1208
1209 mul->operands[0]->accept(this);
1210 src_reg src1 = fix_3src_operand(this->result);
1211 src1.negate ^= mul_negate;
1212 src1.abs = mul_abs;
1213 if (mul_abs)
1214 src1.negate = false;
1215
1216 mul->operands[1]->accept(this);
1217 src_reg src2 = fix_3src_operand(this->result);
1218 src2.abs = mul_abs;
1219 if (mul_abs)
1220 src2.negate = false;
1221
1222 this->result = src_reg(this, ir->type);
1223 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1224
1225 return true;
1226 }
1227
1228 bool
1229 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1230 {
1231 /* This optimization relies on CMP setting the destination to 0 when
1232 * false. Early hardware only sets the least significant bit, and
1233 * leaves the other bits undefined. So we can't use it.
1234 */
1235 if (devinfo->gen < 6)
1236 return false;
1237
1238 ir_expression *const cmp = ir->operands[0]->as_expression();
1239
1240 if (cmp == NULL)
1241 return false;
1242
1243 switch (cmp->operation) {
1244 case ir_binop_less:
1245 case ir_binop_greater:
1246 case ir_binop_lequal:
1247 case ir_binop_gequal:
1248 case ir_binop_equal:
1249 case ir_binop_nequal:
1250 break;
1251
1252 default:
1253 return false;
1254 }
1255
1256 cmp->operands[0]->accept(this);
1257 const src_reg cmp_src0 = this->result;
1258
1259 cmp->operands[1]->accept(this);
1260 const src_reg cmp_src1 = this->result;
1261
1262 this->result = src_reg(this, ir->type);
1263
1264 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1265 brw_conditional_for_comparison(cmp->operation)));
1266
1267 /* If the comparison is false, this->result will just happen to be zero.
1268 */
1269 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1270 this->result, src_reg(1.0f));
1271 inst->predicate = BRW_PREDICATE_NORMAL;
1272 inst->predicate_inverse = true;
1273
1274 return true;
1275 }
1276
1277 vec4_instruction *
1278 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1279 src_reg src0, src_reg src1)
1280 {
1281 vec4_instruction *inst;
1282
1283 if (devinfo->gen >= 6) {
1284 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1285 inst->conditional_mod = conditionalmod;
1286 } else {
1287 emit(CMP(dst, src0, src1, conditionalmod));
1288
1289 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1290 inst->predicate = BRW_PREDICATE_NORMAL;
1291 }
1292
1293 return inst;
1294 }
1295
1296 vec4_instruction *
1297 vec4_visitor::emit_lrp(const dst_reg &dst,
1298 const src_reg &x, const src_reg &y, const src_reg &a)
1299 {
1300 if (devinfo->gen >= 6) {
1301 /* Note that the instruction's argument order is reversed from GLSL
1302 * and the IR.
1303 */
1304 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1305 fix_3src_operand(x)));
1306 } else {
1307 /* Earlier generations don't support three source operations, so we
1308 * need to emit x*(1-a) + y*a.
1309 */
1310 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1311 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1312 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1313 y_times_a.writemask = dst.writemask;
1314 one_minus_a.writemask = dst.writemask;
1315 x_times_one_minus_a.writemask = dst.writemask;
1316
1317 emit(MUL(y_times_a, y, a));
1318 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1319 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1320 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1321 }
1322 }
1323
1324 /**
1325 * Emits the instructions needed to perform a pull constant load. before_block
1326 * and before_inst can be NULL in which case the instruction will be appended
1327 * to the end of the instruction list.
1328 */
1329 void
1330 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1331 src_reg surf_index,
1332 src_reg offset_reg,
1333 bblock_t *before_block,
1334 vec4_instruction *before_inst)
1335 {
1336 assert((before_inst == NULL && before_block == NULL) ||
1337 (before_inst && before_block));
1338
1339 vec4_instruction *pull;
1340
1341 if (devinfo->gen >= 9) {
1342 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1343 src_reg header(this, glsl_type::uvec4_type, 2);
1344
1345 pull = new(mem_ctx)
1346 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1347 dst_reg(header));
1348
1349 if (before_inst)
1350 emit_before(before_block, before_inst, pull);
1351 else
1352 emit(pull);
1353
1354 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1355 offset_reg.type);
1356 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1357
1358 if (before_inst)
1359 emit_before(before_block, before_inst, pull);
1360 else
1361 emit(pull);
1362
1363 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1364 dst,
1365 surf_index,
1366 header);
1367 pull->mlen = 2;
1368 pull->header_size = 1;
1369 } else if (devinfo->gen >= 7) {
1370 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1371
1372 grf_offset.type = offset_reg.type;
1373
1374 pull = MOV(grf_offset, offset_reg);
1375
1376 if (before_inst)
1377 emit_before(before_block, before_inst, pull);
1378 else
1379 emit(pull);
1380
1381 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1382 dst,
1383 surf_index,
1384 src_reg(grf_offset));
1385 pull->mlen = 1;
1386 } else {
1387 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1388 dst,
1389 surf_index,
1390 offset_reg);
1391 pull->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
1392 pull->mlen = 1;
1393 }
1394
1395 if (before_inst)
1396 emit_before(before_block, before_inst, pull);
1397 else
1398 emit(pull);
1399 }
1400
1401 src_reg
1402 vec4_visitor::emit_uniformize(const src_reg &src)
1403 {
1404 const src_reg chan_index(this, glsl_type::uint_type);
1405 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1406 src.type);
1407
1408 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1409 ->force_writemask_all = true;
1410 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1411 ->force_writemask_all = true;
1412
1413 return src_reg(dst);
1414 }
1415
1416 void
1417 vec4_visitor::visit(ir_expression *ir)
1418 {
1419 unsigned int operand;
1420 src_reg op[ARRAY_SIZE(ir->operands)];
1421 vec4_instruction *inst;
1422
1423 if (ir->operation == ir_binop_add) {
1424 if (try_emit_mad(ir))
1425 return;
1426 }
1427
1428 if (ir->operation == ir_unop_b2f) {
1429 if (try_emit_b2f_of_compare(ir))
1430 return;
1431 }
1432
1433 /* Storage for our result. Ideally for an assignment we'd be using
1434 * the actual storage for the result here, instead.
1435 */
1436 dst_reg result_dst(this, ir->type);
1437 src_reg result_src(result_dst);
1438
1439 if (ir->operation == ir_triop_csel) {
1440 ir->operands[1]->accept(this);
1441 op[1] = this->result;
1442 ir->operands[2]->accept(this);
1443 op[2] = this->result;
1444
1445 enum brw_predicate predicate;
1446 emit_bool_to_cond_code(ir->operands[0], &predicate);
1447 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1448 inst->predicate = predicate;
1449 this->result = result_src;
1450 return;
1451 }
1452
1453 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1454 this->result.file = BAD_FILE;
1455 ir->operands[operand]->accept(this);
1456 if (this->result.file == BAD_FILE) {
1457 fprintf(stderr, "Failed to get tree for expression operand:\n");
1458 ir->operands[operand]->fprint(stderr);
1459 exit(1);
1460 }
1461 op[operand] = this->result;
1462
1463 /* Matrix expression operands should have been broken down to vector
1464 * operations already.
1465 */
1466 assert(!ir->operands[operand]->type->is_matrix());
1467 }
1468
1469 /* If nothing special happens, this is the result. */
1470 this->result = result_src;
1471
1472 switch (ir->operation) {
1473 case ir_unop_logic_not:
1474 emit(NOT(result_dst, op[0]));
1475 break;
1476 case ir_unop_neg:
1477 op[0].negate = !op[0].negate;
1478 emit(MOV(result_dst, op[0]));
1479 break;
1480 case ir_unop_abs:
1481 op[0].abs = true;
1482 op[0].negate = false;
1483 emit(MOV(result_dst, op[0]));
1484 break;
1485
1486 case ir_unop_sign:
1487 if (ir->type->is_float()) {
1488 /* AND(val, 0x80000000) gives the sign bit.
1489 *
1490 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1491 * zero.
1492 */
1493 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1494
1495 op[0].type = BRW_REGISTER_TYPE_UD;
1496 result_dst.type = BRW_REGISTER_TYPE_UD;
1497 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1498
1499 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1500 inst->predicate = BRW_PREDICATE_NORMAL;
1501
1502 this->result.type = BRW_REGISTER_TYPE_F;
1503 } else {
1504 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1505 * -> non-negative val generates 0x00000000.
1506 * Predicated OR sets 1 if val is positive.
1507 */
1508 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1509
1510 emit(ASR(result_dst, op[0], src_reg(31)));
1511
1512 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1513 inst->predicate = BRW_PREDICATE_NORMAL;
1514 }
1515 break;
1516
1517 case ir_unop_rcp:
1518 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1519 break;
1520
1521 case ir_unop_exp2:
1522 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1523 break;
1524 case ir_unop_log2:
1525 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1526 break;
1527 case ir_unop_exp:
1528 case ir_unop_log:
1529 unreachable("not reached: should be handled by ir_explog_to_explog2");
1530 case ir_unop_sin:
1531 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1532 break;
1533 case ir_unop_cos:
1534 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1535 break;
1536
1537 case ir_unop_dFdx:
1538 case ir_unop_dFdx_coarse:
1539 case ir_unop_dFdx_fine:
1540 case ir_unop_dFdy:
1541 case ir_unop_dFdy_coarse:
1542 case ir_unop_dFdy_fine:
1543 unreachable("derivatives not valid in vertex shader");
1544
1545 case ir_unop_bitfield_reverse:
1546 emit(BFREV(result_dst, op[0]));
1547 break;
1548 case ir_unop_bit_count:
1549 emit(CBIT(result_dst, op[0]));
1550 break;
1551 case ir_unop_find_msb: {
1552 src_reg temp = src_reg(this, glsl_type::uint_type);
1553
1554 inst = emit(FBH(dst_reg(temp), op[0]));
1555 inst->dst.writemask = WRITEMASK_XYZW;
1556
1557 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1558 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1559 * subtract the result from 31 to convert the MSB count into an LSB count.
1560 */
1561
1562 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1563 temp.swizzle = BRW_SWIZZLE_NOOP;
1564 emit(MOV(result_dst, temp));
1565
1566 src_reg src_tmp = src_reg(result_dst);
1567 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1568
1569 src_tmp.negate = true;
1570 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1571 inst->predicate = BRW_PREDICATE_NORMAL;
1572 break;
1573 }
1574 case ir_unop_find_lsb:
1575 emit(FBL(result_dst, op[0]));
1576 break;
1577 case ir_unop_saturate:
1578 inst = emit(MOV(result_dst, op[0]));
1579 inst->saturate = true;
1580 break;
1581
1582 case ir_unop_noise:
1583 unreachable("not reached: should be handled by lower_noise");
1584
1585 case ir_unop_subroutine_to_int:
1586 emit(MOV(result_dst, op[0]));
1587 break;
1588
1589 case ir_unop_ssbo_unsized_array_length:
1590 unreachable("not reached: should be handled by lower_ubo_reference");
1591 break;
1592
1593 case ir_binop_add:
1594 emit(ADD(result_dst, op[0], op[1]));
1595 break;
1596 case ir_binop_sub:
1597 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1598
1599 case ir_binop_mul:
1600 if (devinfo->gen < 8 && ir->type->is_integer()) {
1601 /* For integer multiplication, the MUL uses the low 16 bits of one of
1602 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1603 * accumulates in the contribution of the upper 16 bits of that
1604 * operand. If we can determine that one of the args is in the low
1605 * 16 bits, though, we can just emit a single MUL.
1606 */
1607 if (ir->operands[0]->is_uint16_constant()) {
1608 if (devinfo->gen < 7)
1609 emit(MUL(result_dst, op[0], op[1]));
1610 else
1611 emit(MUL(result_dst, op[1], op[0]));
1612 } else if (ir->operands[1]->is_uint16_constant()) {
1613 if (devinfo->gen < 7)
1614 emit(MUL(result_dst, op[1], op[0]));
1615 else
1616 emit(MUL(result_dst, op[0], op[1]));
1617 } else {
1618 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1619
1620 emit(MUL(acc, op[0], op[1]));
1621 emit(MACH(dst_null_d(), op[0], op[1]));
1622 emit(MOV(result_dst, src_reg(acc)));
1623 }
1624 } else {
1625 emit(MUL(result_dst, op[0], op[1]));
1626 }
1627 break;
1628 case ir_binop_imul_high: {
1629 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1630
1631 emit(MUL(acc, op[0], op[1]));
1632 emit(MACH(result_dst, op[0], op[1]));
1633 break;
1634 }
1635 case ir_binop_div:
1636 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1637 assert(ir->type->is_integer());
1638 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1639 break;
1640
1641 case ir_binop_carry:
1642 unreachable("Should have been lowered by carry_to_arith().");
1643
1644 case ir_binop_borrow:
1645 unreachable("Should have been lowered by borrow_to_arith().");
1646
1647 case ir_binop_mod:
1648 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1649 assert(ir->type->is_integer());
1650 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1651 break;
1652
1653 case ir_binop_less:
1654 case ir_binop_greater:
1655 case ir_binop_lequal:
1656 case ir_binop_gequal:
1657 case ir_binop_equal:
1658 case ir_binop_nequal: {
1659 if (devinfo->gen <= 5) {
1660 resolve_bool_comparison(ir->operands[0], &op[0]);
1661 resolve_bool_comparison(ir->operands[1], &op[1]);
1662 }
1663 emit(CMP(result_dst, op[0], op[1],
1664 brw_conditional_for_comparison(ir->operation)));
1665 break;
1666 }
1667
1668 case ir_binop_all_equal:
1669 if (devinfo->gen <= 5) {
1670 resolve_bool_comparison(ir->operands[0], &op[0]);
1671 resolve_bool_comparison(ir->operands[1], &op[1]);
1672 }
1673
1674 /* "==" operator producing a scalar boolean. */
1675 if (ir->operands[0]->type->is_vector() ||
1676 ir->operands[1]->type->is_vector()) {
1677 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1678 emit(MOV(result_dst, src_reg(0)));
1679 inst = emit(MOV(result_dst, src_reg(~0)));
1680 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1681 } else {
1682 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1683 }
1684 break;
1685 case ir_binop_any_nequal:
1686 if (devinfo->gen <= 5) {
1687 resolve_bool_comparison(ir->operands[0], &op[0]);
1688 resolve_bool_comparison(ir->operands[1], &op[1]);
1689 }
1690
1691 /* "!=" operator producing a scalar boolean. */
1692 if (ir->operands[0]->type->is_vector() ||
1693 ir->operands[1]->type->is_vector()) {
1694 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1695
1696 emit(MOV(result_dst, src_reg(0)));
1697 inst = emit(MOV(result_dst, src_reg(~0)));
1698 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1699 } else {
1700 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1701 }
1702 break;
1703
1704 case ir_unop_any:
1705 if (devinfo->gen <= 5) {
1706 resolve_bool_comparison(ir->operands[0], &op[0]);
1707 }
1708 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1709 emit(MOV(result_dst, src_reg(0)));
1710
1711 inst = emit(MOV(result_dst, src_reg(~0)));
1712 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1713 break;
1714
1715 case ir_binop_logic_xor:
1716 emit(XOR(result_dst, op[0], op[1]));
1717 break;
1718
1719 case ir_binop_logic_or:
1720 emit(OR(result_dst, op[0], op[1]));
1721 break;
1722
1723 case ir_binop_logic_and:
1724 emit(AND(result_dst, op[0], op[1]));
1725 break;
1726
1727 case ir_binop_dot:
1728 assert(ir->operands[0]->type->is_vector());
1729 assert(ir->operands[0]->type == ir->operands[1]->type);
1730 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1731 break;
1732
1733 case ir_unop_sqrt:
1734 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1735 break;
1736 case ir_unop_rsq:
1737 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1738 break;
1739
1740 case ir_unop_bitcast_i2f:
1741 case ir_unop_bitcast_u2f:
1742 this->result = op[0];
1743 this->result.type = BRW_REGISTER_TYPE_F;
1744 break;
1745
1746 case ir_unop_bitcast_f2i:
1747 this->result = op[0];
1748 this->result.type = BRW_REGISTER_TYPE_D;
1749 break;
1750
1751 case ir_unop_bitcast_f2u:
1752 this->result = op[0];
1753 this->result.type = BRW_REGISTER_TYPE_UD;
1754 break;
1755
1756 case ir_unop_i2f:
1757 case ir_unop_i2u:
1758 case ir_unop_u2i:
1759 case ir_unop_u2f:
1760 case ir_unop_f2i:
1761 case ir_unop_f2u:
1762 emit(MOV(result_dst, op[0]));
1763 break;
1764 case ir_unop_b2i:
1765 case ir_unop_b2f:
1766 if (devinfo->gen <= 5) {
1767 resolve_bool_comparison(ir->operands[0], &op[0]);
1768 }
1769 emit(MOV(result_dst, negate(op[0])));
1770 break;
1771 case ir_unop_f2b:
1772 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1773 break;
1774 case ir_unop_i2b:
1775 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1776 break;
1777
1778 case ir_unop_trunc:
1779 emit(RNDZ(result_dst, op[0]));
1780 break;
1781 case ir_unop_ceil: {
1782 src_reg tmp = src_reg(this, ir->type);
1783 op[0].negate = !op[0].negate;
1784 emit(RNDD(dst_reg(tmp), op[0]));
1785 tmp.negate = true;
1786 emit(MOV(result_dst, tmp));
1787 }
1788 break;
1789 case ir_unop_floor:
1790 inst = emit(RNDD(result_dst, op[0]));
1791 break;
1792 case ir_unop_fract:
1793 inst = emit(FRC(result_dst, op[0]));
1794 break;
1795 case ir_unop_round_even:
1796 emit(RNDE(result_dst, op[0]));
1797 break;
1798
1799 case ir_unop_get_buffer_size:
1800 unreachable("not reached: not implemented");
1801 break;
1802
1803 case ir_binop_min:
1804 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1805 break;
1806 case ir_binop_max:
1807 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1808 break;
1809
1810 case ir_binop_pow:
1811 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1812 break;
1813
1814 case ir_unop_bit_not:
1815 inst = emit(NOT(result_dst, op[0]));
1816 break;
1817 case ir_binop_bit_and:
1818 inst = emit(AND(result_dst, op[0], op[1]));
1819 break;
1820 case ir_binop_bit_xor:
1821 inst = emit(XOR(result_dst, op[0], op[1]));
1822 break;
1823 case ir_binop_bit_or:
1824 inst = emit(OR(result_dst, op[0], op[1]));
1825 break;
1826
1827 case ir_binop_lshift:
1828 inst = emit(SHL(result_dst, op[0], op[1]));
1829 break;
1830
1831 case ir_binop_rshift:
1832 if (ir->type->base_type == GLSL_TYPE_INT)
1833 inst = emit(ASR(result_dst, op[0], op[1]));
1834 else
1835 inst = emit(SHR(result_dst, op[0], op[1]));
1836 break;
1837
1838 case ir_binop_bfm:
1839 emit(BFI1(result_dst, op[0], op[1]));
1840 break;
1841
1842 case ir_binop_ubo_load: {
1843 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1844 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1845 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1846 src_reg offset;
1847
1848 /* Now, load the vector from that offset. */
1849 assert(ir->type->is_vector() || ir->type->is_scalar());
1850
1851 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1852 packed_consts.type = result.type;
1853 src_reg surf_index;
1854
1855 if (const_uniform_block) {
1856 /* The block index is a constant, so just emit the binding table entry
1857 * as an immediate.
1858 */
1859 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1860 const_uniform_block->value.u[0]);
1861 } else {
1862 /* The block index is not a constant. Evaluate the index expression
1863 * per-channel and add the base UBO index; we have to select a value
1864 * from any live channel.
1865 */
1866 surf_index = src_reg(this, glsl_type::uint_type);
1867 emit(ADD(dst_reg(surf_index), op[0],
1868 src_reg(prog_data->base.binding_table.ubo_start)));
1869 surf_index = emit_uniformize(surf_index);
1870
1871 /* Assume this may touch any UBO. It would be nice to provide
1872 * a tighter bound, but the array information is already lowered away.
1873 */
1874 brw_mark_surface_used(&prog_data->base,
1875 prog_data->base.binding_table.ubo_start +
1876 shader_prog->NumBufferInterfaceBlocks - 1);
1877 }
1878
1879 if (const_offset_ir) {
1880 if (devinfo->gen >= 8) {
1881 /* Store the offset in a GRF so we can send-from-GRF. */
1882 offset = src_reg(this, glsl_type::int_type);
1883 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1884 } else {
1885 /* Immediates are fine on older generations since they'll be moved
1886 * to a (potentially fake) MRF at the generator level.
1887 */
1888 offset = src_reg(const_offset / 16);
1889 }
1890 } else {
1891 offset = src_reg(this, glsl_type::uint_type);
1892 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1893 }
1894
1895 emit_pull_constant_load_reg(dst_reg(packed_consts),
1896 surf_index,
1897 offset,
1898 NULL, NULL /* before_block/inst */);
1899
1900 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1901 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1902 const_offset % 16 / 4,
1903 const_offset % 16 / 4,
1904 const_offset % 16 / 4);
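/* E.g. for a float at constant byte offset 20, the pull load fetches the
 * vec4 at offset 16 and the swizzle becomes YYYY to read component 1.
 */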
1905
1906 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1907 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1908 emit(CMP(result_dst, packed_consts, src_reg(0u),
1909 BRW_CONDITIONAL_NZ));
1910 } else {
1911 emit(MOV(result_dst, packed_consts));
1912 }
1913 break;
1914 }
1915
1916 case ir_binop_vector_extract:
1917 unreachable("should have been lowered by vec_index_to_cond_assign");
1918
1919 case ir_triop_fma:
1920 op[0] = fix_3src_operand(op[0]);
1921 op[1] = fix_3src_operand(op[1]);
1922 op[2] = fix_3src_operand(op[2]);
1923 /* Note that the instruction's argument order is reversed from GLSL
1924 * and the IR.
1925 */
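/* MAD computes src0 + src1 * src2, so fma(a, b, c) becomes MAD(dst, c, b, a). */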
1926 emit(MAD(result_dst, op[2], op[1], op[0]));
1927 break;
1928
1929 case ir_triop_lrp:
1930 emit_lrp(result_dst, op[0], op[1], op[2]);
1931 break;
1932
1933 case ir_triop_csel:
1934 unreachable("already handled above");
1935 break;
1936
1937 case ir_triop_bfi:
1938 op[0] = fix_3src_operand(op[0]);
1939 op[1] = fix_3src_operand(op[1]);
1940 op[2] = fix_3src_operand(op[2]);
1941 emit(BFI2(result_dst, op[0], op[1], op[2]));
1942 break;
1943
1944 case ir_triop_bitfield_extract:
1945 op[0] = fix_3src_operand(op[0]);
1946 op[1] = fix_3src_operand(op[1]);
1947 op[2] = fix_3src_operand(op[2]);
1948 /* Note that the instruction's argument order is reversed from GLSL
1949 * and the IR.
1950 */
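/* bitfieldExtract(value, offset, bits) thus becomes BFE(dst, bits, offset, value). */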
1951 emit(BFE(result_dst, op[2], op[1], op[0]));
1952 break;
1953
1954 case ir_triop_vector_insert:
1955 unreachable("should have been lowered by lower_vector_insert");
1956
1957 case ir_quadop_bitfield_insert:
1958 unreachable("not reached: should be handled by "
1959 "bitfield_insert_to_bfm_bfi\n");
1960
1961 case ir_quadop_vector:
1962 unreachable("not reached: should be handled by lower_quadop_vector");
1963
1964 case ir_unop_pack_half_2x16:
1965 emit_pack_half_2x16(result_dst, op[0]);
1966 break;
1967 case ir_unop_unpack_half_2x16:
1968 emit_unpack_half_2x16(result_dst, op[0]);
1969 break;
1970 case ir_unop_unpack_unorm_4x8:
1971 emit_unpack_unorm_4x8(result_dst, op[0]);
1972 break;
1973 case ir_unop_unpack_snorm_4x8:
1974 emit_unpack_snorm_4x8(result_dst, op[0]);
1975 break;
1976 case ir_unop_pack_unorm_4x8:
1977 emit_pack_unorm_4x8(result_dst, op[0]);
1978 break;
1979 case ir_unop_pack_snorm_4x8:
1980 emit_pack_snorm_4x8(result_dst, op[0]);
1981 break;
1982 case ir_unop_pack_snorm_2x16:
1983 case ir_unop_pack_unorm_2x16:
1984 case ir_unop_unpack_snorm_2x16:
1985 case ir_unop_unpack_unorm_2x16:
1986 unreachable("not reached: should be handled by lower_packing_builtins");
1987 case ir_unop_unpack_half_2x16_split_x:
1988 case ir_unop_unpack_half_2x16_split_y:
1989 case ir_binop_pack_half_2x16_split:
1990 case ir_unop_interpolate_at_centroid:
1991 case ir_binop_interpolate_at_sample:
1992 case ir_binop_interpolate_at_offset:
1993 unreachable("not reached: should not occur in vertex shader");
1994 case ir_binop_ldexp:
1995 unreachable("not reached: should be handled by ldexp_to_arith()");
1996 case ir_unop_d2f:
1997 case ir_unop_f2d:
1998 case ir_unop_d2i:
1999 case ir_unop_i2d:
2000 case ir_unop_d2u:
2001 case ir_unop_u2d:
2002 case ir_unop_d2b:
2003 case ir_unop_pack_double_2x32:
2004 case ir_unop_unpack_double_2x32:
2005 case ir_unop_frexp_sig:
2006 case ir_unop_frexp_exp:
2007 unreachable("fp64 todo");
2008 }
2009 }
2010
2011
2012 void
2013 vec4_visitor::visit(ir_swizzle *ir)
2014 {
2015 /* Note that this only handles swizzles in expressions, not those on the
2016 * left-hand side of an assignment, which use write masking. See
2017 * ir_assignment for that.
2018 */
2019 const unsigned swz = brw_compose_swizzle(
2020 brw_swizzle_for_size(ir->type->vector_elements),
2021 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2022
2023 ir->val->accept(this);
2024 this->result = swizzle(this->result, swz);
2025 }
2026
2027 void
2028 vec4_visitor::visit(ir_dereference_variable *ir)
2029 {
2030 const struct glsl_type *type = ir->type;
2031 dst_reg *reg = variable_storage(ir->var);
2032
2033 if (!reg) {
2034 fail("Failed to find variable storage for %s\n", ir->var->name);
2035 this->result = src_reg(brw_null_reg());
2036 return;
2037 }
2038
2039 this->result = src_reg(*reg);
2040
2041 /* System values get their swizzle from the dst_reg writemask */
2042 if (ir->var->data.mode == ir_var_system_value)
2043 return;
2044
2045 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2046 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2047 }
2048
2049
2050 int
2051 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2052 {
2053 /* Under normal circumstances array elements are stored consecutively, so
2054 * the stride is equal to the size of the array element.
2055 */
2056 return type_size_vec4(ir->type);
2057 }
2058
2059
2060 void
2061 vec4_visitor::visit(ir_dereference_array *ir)
2062 {
2063 ir_constant *constant_index;
2064 src_reg src;
2065 int array_stride = compute_array_stride(ir);
2066
2067 constant_index = ir->array_index->constant_expression_value();
2068
2069 ir->array->accept(this);
2070 src = this->result;
2071
2072 if (constant_index) {
2073 src.reg_offset += constant_index->value.i[0] * array_stride;
2074 } else {
2075 /* Variable index array dereference. It takes the "vec4" register
2076 * of the base of the array and an index register that offsets the
2077 * register index.
2078 */
2079 ir->array_index->accept(this);
2080
2081 src_reg index_reg;
2082
2083 if (array_stride == 1) {
2084 index_reg = this->result;
2085 } else {
2086 index_reg = src_reg(this, glsl_type::int_type);
2087
2088 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2089 }
2090
2091 if (src.reladdr) {
2092 src_reg temp = src_reg(this, glsl_type::int_type);
2093
2094 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2095
2096 index_reg = temp;
2097 }
2098
2099 src.reladdr = ralloc(mem_ctx, src_reg);
2100 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2101 }
2102
2103 /* If the type is smaller than a vec4, replicate the last channel out. */
2104 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2105 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2106 else
2107 src.swizzle = BRW_SWIZZLE_NOOP;
2108 src.type = brw_type_for_base_type(ir->type);
2109
2110 this->result = src;
2111 }
2112
2113 void
2114 vec4_visitor::visit(ir_dereference_record *ir)
2115 {
2116 unsigned int i;
2117 const glsl_type *struct_type = ir->record->type;
2118 int offset = 0;
2119
2120 ir->record->accept(this);
2121
2122 for (i = 0; i < struct_type->length; i++) {
2123 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2124 break;
2125 offset += type_size_vec4(struct_type->fields.structure[i].type);
2126 }
2127
2128 /* If the type is smaller than a vec4, replicate the last channel out. */
2129 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2130 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2131 else
2132 this->result.swizzle = BRW_SWIZZLE_NOOP;
2133 this->result.type = brw_type_for_base_type(ir->type);
2134
2135 this->result.reg_offset += offset;
2136 }
2137
2138 /**
2139 * We want to be careful in assignment setup to hit the actual storage
2140 * instead of potentially using a temporary like we might with the
2141 * ir_dereference handler.
2142 */
2143 static dst_reg
2144 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2145 {
2146 /* The LHS must be a dereference. If the LHS is a variable indexed array
2147 * access of a vector, it must be separated into a series of conditional moves
2148 * before reaching this point (see ir_vec_index_to_cond_assign).
2149 */
2150 assert(ir->as_dereference());
2151 ir_dereference_array *deref_array = ir->as_dereference_array();
2152 if (deref_array) {
2153 assert(!deref_array->array->type->is_vector());
2154 }
2155
2156 /* Use the rvalue deref handler for the most part. We'll ignore
2157 * swizzles in it and write swizzles using writemask, though.
2158 */
2159 ir->accept(v);
2160 return dst_reg(v->result);
2161 }
2162
2163 void
2164 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2165 const struct glsl_type *type,
2166 enum brw_predicate predicate)
2167 {
2168 if (type->base_type == GLSL_TYPE_STRUCT) {
2169 for (unsigned int i = 0; i < type->length; i++) {
2170 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2171 }
2172 return;
2173 }
2174
2175 if (type->is_array()) {
2176 for (unsigned int i = 0; i < type->length; i++) {
2177 emit_block_move(dst, src, type->fields.array, predicate);
2178 }
2179 return;
2180 }
2181
2182 if (type->is_matrix()) {
2183 const struct glsl_type *vec_type;
2184
2185 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2186 type->vector_elements, 1);
2187
2188 for (int i = 0; i < type->matrix_columns; i++) {
2189 emit_block_move(dst, src, vec_type, predicate);
2190 }
2191 return;
2192 }
2193
2194 assert(type->is_scalar() || type->is_vector());
2195
2196 dst->type = brw_type_for_base_type(type);
2197 src->type = dst->type;
2198
2199 dst->writemask = (1 << type->vector_elements) - 1;
2200
2201 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2202
2203 vec4_instruction *inst = emit(MOV(*dst, *src));
2204 inst->predicate = predicate;
2205
2206 dst->reg_offset++;
2207 src->reg_offset++;
2208 }
2209
2210
2211 /* If the RHS processing resulted in an instruction generating a
2212 * temporary value, and it would be easy to rewrite the instruction to
2213 * generate its result right into the LHS instead, do so. This ends
2214 * up reliably removing instructions where it can be tricky to do so
2215 * later without real UD chain information.
2216 */
2217 bool
2218 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2219 dst_reg dst,
2220 src_reg src,
2221 vec4_instruction *pre_rhs_inst,
2222 vec4_instruction *last_rhs_inst)
2223 {
2224 /* This could be supported, but it would take more smarts. */
2225 if (ir->condition)
2226 return false;
2227
2228 if (pre_rhs_inst == last_rhs_inst)
2229 return false; /* No instructions generated to work with. */
2230
2231 /* Make sure the last instruction generated our source reg. */
2232 if (src.file != GRF ||
2233 src.file != last_rhs_inst->dst.file ||
2234 src.reg != last_rhs_inst->dst.reg ||
2235 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2236 src.reladdr ||
2237 src.abs ||
2238 src.negate ||
2239 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2240 return false;
2241
2242 /* Check that the last instruction fully initialized the channels
2243 * we want to use, in the order we want to use them. We could
2244 * potentially reswizzle the operands of many instructions so that
2245 * we could handle out of order channels, but don't yet.
2246 */
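/* For example, if the swizzled RHS reads channel y into destination
 * channel x (swizzle .yx on an .xy write), the channels are out of order
 * and the rewrite is skipped.
 */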
2247
2248 for (unsigned i = 0; i < 4; i++) {
2249 if (dst.writemask & (1 << i)) {
2250 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2251 return false;
2252
2253 if (BRW_GET_SWZ(src.swizzle, i) != i)
2254 return false;
2255 }
2256 }
2257
2258 /* Success! Rewrite the instruction. */
2259 last_rhs_inst->dst.file = dst.file;
2260 last_rhs_inst->dst.reg = dst.reg;
2261 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2262 last_rhs_inst->dst.reladdr = dst.reladdr;
2263 last_rhs_inst->dst.writemask &= dst.writemask;
2264
2265 return true;
2266 }
2267
2268 void
2269 vec4_visitor::visit(ir_assignment *ir)
2270 {
2271 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2272 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2273
2274 if (!ir->lhs->type->is_scalar() &&
2275 !ir->lhs->type->is_vector()) {
2276 ir->rhs->accept(this);
2277 src_reg src = this->result;
2278
2279 if (ir->condition) {
2280 emit_bool_to_cond_code(ir->condition, &predicate);
2281 }
2282
2283 /* emit_block_move doesn't account for swizzles in the source register.
2284 * This should be ok, since the source register is a structure or an
2285 * array, and those can't be swizzled. But double-check to be sure.
2286 */
2287 assert(src.swizzle ==
2288 (ir->rhs->type->is_matrix()
2289 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2290 : BRW_SWIZZLE_NOOP));
2291
2292 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2293 return;
2294 }
2295
2296 /* Now we're down to just a scalar/vector with writemasks. */
2297 int i;
2298
2299 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2300 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2301
2302 ir->rhs->accept(this);
2303
2304 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2305
2306 int swizzles[4];
2307 int src_chan = 0;
2308
2309 assert(ir->lhs->type->is_vector() ||
2310 ir->lhs->type->is_scalar());
2311 dst.writemask = ir->write_mask;
2312
2313 /* Swizzle a small RHS vector into the channels being written.
2314 *
2315 * glsl ir treats write_mask as dictating how many channels are
2316 * present on the RHS while in our instructions we need to make
2317 * those channels appear in the slots of the vec4 they're written to.
2318 */
2319 for (int i = 0; i < 4; i++)
2320 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2321
2322 src_reg src = swizzle(this->result,
2323 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2324 swizzles[2], swizzles[3]));
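/* E.g. with write_mask = .xz the swizzles become (0, 0, 1, 0): RHS
 * channel 0 lands in x and RHS channel 1 lands in z; the unwritten y and
 * w channels are don't-cares.
 */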
2325
2326 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2327 return;
2328 }
2329
2330 if (ir->condition) {
2331 emit_bool_to_cond_code(ir->condition, &predicate);
2332 }
2333
2334 for (i = 0; i < type_size_vec4(ir->lhs->type); i++) {
2335 vec4_instruction *inst = emit(MOV(dst, src));
2336 inst->predicate = predicate;
2337
2338 dst.reg_offset++;
2339 src.reg_offset++;
2340 }
2341 }
2342
2343 void
2344 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2345 {
2346 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2347 foreach_in_list(ir_constant, field_value, &ir->components) {
2348 emit_constant_values(dst, field_value);
2349 }
2350 return;
2351 }
2352
2353 if (ir->type->is_array()) {
2354 for (unsigned int i = 0; i < ir->type->length; i++) {
2355 emit_constant_values(dst, ir->array_elements[i]);
2356 }
2357 return;
2358 }
2359
2360 if (ir->type->is_matrix()) {
2361 for (int i = 0; i < ir->type->matrix_columns; i++) {
2362 float *vec = &ir->value.f[i * ir->type->vector_elements];
2363
2364 for (int j = 0; j < ir->type->vector_elements; j++) {
2365 dst->writemask = 1 << j;
2366 dst->type = BRW_REGISTER_TYPE_F;
2367
2368 emit(MOV(*dst, src_reg(vec[j])));
2369 }
2370 dst->reg_offset++;
2371 }
2372 return;
2373 }
2374
2375 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2376
2377 for (int i = 0; i < ir->type->vector_elements; i++) {
2378 if (!(remaining_writemask & (1 << i)))
2379 continue;
2380
2381 dst->writemask = 1 << i;
2382 dst->type = brw_type_for_base_type(ir->type);
2383
2384 /* Find other components that match the one we're about to
2385 * write. Emits fewer instructions for things like vec4(0.5,
2386 * 1.5, 1.5, 1.5).
2387 */
2388 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2389 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2390 if (ir->value.b[i] == ir->value.b[j])
2391 dst->writemask |= (1 << j);
2392 } else {
2393 /* u, i, and f storage all line up, so no need for a
2394 * switch case for comparing each type.
2395 */
2396 if (ir->value.u[i] == ir->value.u[j])
2397 dst->writemask |= (1 << j);
2398 }
2399 }
2400
2401 switch (ir->type->base_type) {
2402 case GLSL_TYPE_FLOAT:
2403 emit(MOV(*dst, src_reg(ir->value.f[i])));
2404 break;
2405 case GLSL_TYPE_INT:
2406 emit(MOV(*dst, src_reg(ir->value.i[i])));
2407 break;
2408 case GLSL_TYPE_UINT:
2409 emit(MOV(*dst, src_reg(ir->value.u[i])));
2410 break;
2411 case GLSL_TYPE_BOOL:
2412 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2413 break;
2414 default:
2415 unreachable("Non-float/uint/int/bool constant");
2416 }
2417
2418 remaining_writemask &= ~dst->writemask;
2419 }
2420 dst->reg_offset++;
2421 }
2422
2423 void
2424 vec4_visitor::visit(ir_constant *ir)
2425 {
2426 dst_reg dst = dst_reg(this, ir->type);
2427 this->result = src_reg(dst);
2428
2429 emit_constant_values(&dst, ir);
2430 }
2431
2432 void
2433 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2434 {
2435 ir_dereference *deref = static_cast<ir_dereference *>(
2436 ir->actual_parameters.get_head());
2437 ir_variable *location = deref->variable_referenced();
2438 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2439 location->data.binding);
2440
2441 /* Calculate the surface offset */
2442 src_reg offset(this, glsl_type::uint_type);
2443 ir_dereference_array *deref_array = deref->as_dereference_array();
2444 if (deref_array) {
2445 deref_array->array_index->accept(this);
2446
2447 src_reg tmp(this, glsl_type::uint_type);
2448 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2449 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2450 } else {
2451 offset = location->data.atomic.offset;
2452 }
2453
2454 /* Emit the appropriate machine instruction */
2455 const char *callee = ir->callee->function_name();
2456 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2457
2458 if (!strcmp("__intrinsic_atomic_read", callee)) {
2459 emit_untyped_surface_read(surf_index, dst, offset);
2460
2461 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2462 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2463 src_reg(), src_reg());
2464
2465 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2466 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2467 src_reg(), src_reg());
2468 }
2469
2470 brw_mark_surface_used(stage_prog_data, surf_index);
2471 }
2472
2473 void
2474 vec4_visitor::visit(ir_call *ir)
2475 {
2476 const char *callee = ir->callee->function_name();
2477
2478 if (!strcmp("__intrinsic_atomic_read", callee) ||
2479 !strcmp("__intrinsic_atomic_increment", callee) ||
2480 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2481 visit_atomic_counter_intrinsic(ir);
2482 } else {
2483 unreachable("Unsupported intrinsic.");
2484 }
2485 }
2486
2487 src_reg
2488 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2489 src_reg coordinate, src_reg sampler)
2490 {
2491 vec4_instruction *inst =
2492 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2493 dst_reg(this, glsl_type::uvec4_type));
2494 inst->base_mrf = 2;
2495 inst->src[1] = sampler;
2496
2497 int param_base;
2498
2499 if (devinfo->gen >= 9) {
2500 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2501 vec4_instruction *header_inst = new(mem_ctx)
2502 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2503 dst_reg(MRF, inst->base_mrf));
2504
2505 emit(header_inst);
2506
2507 inst->mlen = 2;
2508 inst->header_size = 1;
2509 param_base = inst->base_mrf + 1;
2510 } else {
2511 inst->mlen = 1;
2512 param_base = inst->base_mrf;
2513 }
2514
2515 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2516 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2517 int zero_mask = 0xf & ~coord_mask;
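/* E.g. a 2D coordinate gives coord_mask = XY and zero_mask = ZW, so the
 * unused r and lod slots are written as zero below.
 */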
2518
2519 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2520 coordinate));
2521
2522 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2523 src_reg(0)));
2524
2525 emit(inst);
2526 return src_reg(inst->dst);
2527 }
2528
2529 bool
2530 vec4_visitor::is_high_sampler(src_reg sampler)
2531 {
2532 if (devinfo->gen < 8 && !devinfo->is_haswell)
2533 return false;
2534
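/* Sampler indices that don't fit in the 4-bit descriptor field (16 and
 * up), or any dynamically computed index, count as "high" and require
 * the message header; see the header_size logic in emit_texture().
 */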
2535 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2536 }
2537
2538 void
2539 vec4_visitor::emit_texture(ir_texture_opcode op,
2540 dst_reg dest,
2541 const glsl_type *dest_type,
2542 src_reg coordinate,
2543 int coord_components,
2544 src_reg shadow_comparitor,
2545 src_reg lod, src_reg lod2,
2546 src_reg sample_index,
2547 uint32_t constant_offset,
2548 src_reg offset_value,
2549 src_reg mcs,
2550 bool is_cube_array,
2551 uint32_t sampler,
2552 src_reg sampler_reg)
2553 {
2554 enum opcode opcode;
2555 switch (op) {
2556 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2557 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2558 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2559 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2560 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2561 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2562 case ir_tg4: opcode = offset_value.file != BAD_FILE
2563 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2564 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2565 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
2566 case ir_txb:
2567 unreachable("TXB is not valid for vertex shaders.");
2568 case ir_lod:
2569 unreachable("LOD is not valid for vertex shaders.");
2570 default:
2571 unreachable("Unrecognized tex op");
2572 }
2573
2574 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2575 opcode, dst_reg(this, dest_type));
2576
2577 inst->offset = constant_offset;
2578
2579 /* The message header is necessary for:
2580 * - Gen4 (always)
2581 * - Gen9+ for selecting SIMD4x2
2582 * - Texel offsets
2583 * - Gather channel selection
2584 * - Sampler indices too large to fit in a 4-bit value.
2585 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
2586 */
2587 inst->header_size =
2588 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2589 inst->offset != 0 || op == ir_tg4 ||
2590 op == ir_texture_samples ||
2591 is_high_sampler(sampler_reg)) ? 1 : 0;
2592 inst->base_mrf = 2;
2593 inst->mlen = inst->header_size;
2594 inst->dst.writemask = WRITEMASK_XYZW;
2595 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
2596
2597 inst->src[1] = sampler_reg;
2598
2599 /* MRF for the first parameter */
2600 int param_base = inst->base_mrf + inst->header_size;
2601
2602 if (op == ir_txs || op == ir_query_levels) {
2603 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2604 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2605 inst->mlen++;
2606 } else if (op == ir_texture_samples) {
2607 inst->dst.writemask = WRITEMASK_X;
2608 } else {
2609 /* Load the coordinate */
2610 /* FINISHME: gl_clamp_mask and saturate */
2611 int coord_mask = (1 << coord_components) - 1;
2612 int zero_mask = 0xf & ~coord_mask;
2613
2614 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2615 coordinate));
2616 inst->mlen++;
2617
2618 if (zero_mask != 0) {
2619 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2620 src_reg(0)));
2621 }
2622 /* Load the shadow comparitor */
2623 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
2624 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
2625 WRITEMASK_X),
2626 shadow_comparitor));
2627 inst->mlen++;
2628 }
2629
2630 /* Load the LOD info */
2631 if (op == ir_tex || op == ir_txl) {
2632 int mrf, writemask;
2633 if (devinfo->gen >= 5) {
2634 mrf = param_base + 1;
2635 if (shadow_comparitor.file != BAD_FILE) {
2636 writemask = WRITEMASK_Y;
2637 /* mlen already incremented */
2638 } else {
2639 writemask = WRITEMASK_X;
2640 inst->mlen++;
2641 }
2642 } else /* devinfo->gen == 4 */ {
2643 mrf = param_base;
2644 writemask = WRITEMASK_W;
2645 }
2646 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2647 } else if (op == ir_txf) {
2648 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2649 } else if (op == ir_txf_ms) {
2650 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2651 sample_index));
2652 if (devinfo->gen >= 7) {
2653 /* MCS data is in the first channel of `mcs`, but we need to get it into
2654 * the .y channel of the second vec4 of params, so replicate .x across
2655 * the whole vec4 and then mask off everything except .y
2656 */
2657 mcs.swizzle = BRW_SWIZZLE_XXXX;
2658 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2659 mcs));
2660 }
2661 inst->mlen++;
2662 } else if (op == ir_txd) {
2663 const brw_reg_type type = lod.type;
2664
2665 if (devinfo->gen >= 5) {
2666 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2667 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2668 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2669 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2670 inst->mlen++;
2671
2672 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
2673 lod.swizzle = BRW_SWIZZLE_ZZZZ;
2674 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
2675 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2676 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2677 inst->mlen++;
2678
2679 if (shadow_comparitor.file != BAD_FILE) {
2680 emit(MOV(dst_reg(MRF, param_base + 2,
2681 shadow_comparitor.type, WRITEMASK_Z),
2682 shadow_comparitor));
2683 }
2684 }
2685 } else /* devinfo->gen == 4 */ {
2686 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2687 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2688 inst->mlen += 2;
2689 }
2690 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
2691 if (shadow_comparitor.file != BAD_FILE) {
2692 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
2693 shadow_comparitor));
2694 }
2695
2696 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2697 offset_value));
2698 inst->mlen++;
2699 }
2700 }
2701
2702 emit(inst);
2703
2704 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2705 * spec requires layers.
2706 */
2707 if (op == ir_txs && is_cube_array) {
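/* E.g. a cube array with 4 layers reports 24 (6 faces * 4 layers) in .z;
 * dividing by 6 recovers the 4 layers the GL spec expects.
 */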
2708 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2709 writemask(inst->dst, WRITEMASK_Z),
2710 src_reg(inst->dst), src_reg(6));
2711 }
2712
2713 if (devinfo->gen == 6 && op == ir_tg4) {
2714 emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
2715 }
2716
2717 swizzle_result(op, dest,
2718 src_reg(inst->dst), sampler, dest_type);
2719 }
2720
2721 void
2722 vec4_visitor::visit(ir_texture *ir)
2723 {
2724 uint32_t sampler =
2725 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2726
2727 ir_rvalue *nonconst_sampler_index =
2728 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2729
2730 /* Handle non-constant sampler array indexing */
2731 src_reg sampler_reg;
2732 if (nonconst_sampler_index) {
2733 /* The highest sampler which may be used by this operation is
2734 * the last element of the array. Mark it here, because the generator
2735 * doesn't have enough information to determine the bound.
2736 */
2737 uint32_t array_size = ir->sampler->as_dereference_array()
2738 ->array->type->array_size();
2739
2740 uint32_t max_used = sampler + array_size - 1;
2741 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2742 max_used += prog_data->base.binding_table.gather_texture_start;
2743 } else {
2744 max_used += prog_data->base.binding_table.texture_start;
2745 }
2746
2747 brw_mark_surface_used(&prog_data->base, max_used);
2748
2749 /* Emit code to evaluate the actual indexing expression */
2750 nonconst_sampler_index->accept(this);
2751 src_reg temp(this, glsl_type::uint_type);
2752 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2753 sampler_reg = emit_uniformize(temp);
2754 } else {
2755 /* Single sampler, or constant array index; the indexing expression
2756 * is just an immediate.
2757 */
2758 sampler_reg = src_reg(sampler);
2759 }
2760
2761 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2762 * emitting anything other than setting up the constant result.
2763 */
2764 if (ir->op == ir_tg4) {
2765 ir_constant *chan = ir->lod_info.component->as_constant();
2766 int swiz = GET_SWZ(key_tex->swizzles[sampler], chan->value.i[0]);
2767 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2768 dst_reg result(this, ir->type);
2769 this->result = src_reg(result);
2770 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2771 return;
2772 }
2773 }
2774
2775 /* Should be lowered by do_lower_texture_projection */
2776 assert(!ir->projector);
2777
2778 /* Should be lowered */
2779 assert(!ir->offset || !ir->offset->type->is_array());
2780
2781 /* Generate code to compute all the subexpression trees. This has to be
2782 * done before loading any values into MRFs for the sampler message since
2783 * generating these values may involve SEND messages that need the MRFs.
2784 */
2785 src_reg coordinate;
2786 int coord_components = 0;
2787 if (ir->coordinate) {
2788 coord_components = ir->coordinate->type->vector_elements;
2789 ir->coordinate->accept(this);
2790 coordinate = this->result;
2791 }
2792
2793 src_reg shadow_comparitor;
2794 if (ir->shadow_comparitor) {
2795 ir->shadow_comparitor->accept(this);
2796 shadow_comparitor = this->result;
2797 }
2798
2799 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2800 src_reg offset_value;
2801 if (has_nonconstant_offset) {
2802 ir->offset->accept(this);
2803 offset_value = src_reg(this->result);
2804 }
2805
2806 src_reg lod, lod2, sample_index, mcs;
2807 switch (ir->op) {
2808 case ir_tex:
2809 lod = src_reg(0.0f);
2810 break;
2811 case ir_txf:
2812 case ir_txl:
2813 case ir_txs:
2814 ir->lod_info.lod->accept(this);
2815 lod = this->result;
2816 break;
2817 case ir_query_levels:
2818 lod = src_reg(0);
2819 break;
2820 case ir_txf_ms:
2821 ir->lod_info.sample_index->accept(this);
2822 sample_index = this->result;
2823
2824 if (devinfo->gen >= 7 && key_tex->compressed_multisample_layout_mask & (1 << sampler))
2825 mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2826 else
2827 mcs = src_reg(0u);
2828 break;
2829 case ir_txd:
2830 ir->lod_info.grad.dPdx->accept(this);
2831 lod = this->result;
2832
2833 ir->lod_info.grad.dPdy->accept(this);
2834 lod2 = this->result;
2835 break;
2836 case ir_txb:
2837 case ir_lod:
2838 case ir_tg4:
2839 case ir_texture_samples:
2840 break;
2841 }
2842
2843 uint32_t constant_offset = 0;
2844 if (ir->offset != NULL && !has_nonconstant_offset) {
2845 constant_offset =
2846 brw_texture_offset(ir->offset->as_constant()->value.i,
2847 ir->offset->type->vector_elements);
2848 }
2849
2850 /* Stuff the channel select bits in the top of the texture offset */
2851 if (ir->op == ir_tg4)
2852 constant_offset |=
2853 gather_channel( ir->lod_info.component->as_constant()->value.i[0],
2854 sampler) << 16;
2855
2856 glsl_type const *type = ir->sampler->type;
2857 bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2858 type->sampler_array;
2859
2860 this->result = src_reg(this, ir->type);
2861 dst_reg dest = dst_reg(this->result);
2862
2863 emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
2864 shadow_comparitor,
2865 lod, lod2, sample_index,
2866 constant_offset, offset_value,
2867 mcs, is_cube_array, sampler, sampler_reg);
2868 }
2869
2870 /**
2871 * Apply workarounds for Gen6 gather with UINT/SINT
2872 */
2873 void
2874 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2875 {
2876 if (!wa)
2877 return;
2878
2879 int width = (wa & WA_8BIT) ? 8 : 16;
2880 dst_reg dst_f = dst;
2881 dst_f.type = BRW_REGISTER_TYPE_F;
2882
2883 /* Convert from UNORM to UINT */
2884 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2885 emit(MOV(dst, src_reg(dst_f)));
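/* E.g. an 8-bit value v is returned as v / 255.0; multiplying by 255 and
 * converting back to an integer recovers v.
 */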
2886
2887 if (wa & WA_SIGN) {
2888 /* Reinterpret the UINT value as a signed INT value by
2889 * shifting the sign bit into place, then shifting back
2890 * preserving sign.
2891 */
2892 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2893 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2894 }
2895 }
2896
2897 /**
2898 * Set up the gather channel based on the swizzle, for gather4.
2899 */
2900 uint32_t
2901 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2902 {
2903 int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
2904 switch (swiz) {
2905 case SWIZZLE_X: return 0;
2906 case SWIZZLE_Y:
2907 /* gather4 sampler is broken for green channel on RG32F --
2908 * we must ask for blue instead.
2909 */
2910 if (key_tex->gather_channel_quirk_mask & (1 << sampler))
2911 return 2;
2912 return 1;
2913 case SWIZZLE_Z: return 2;
2914 case SWIZZLE_W: return 3;
2915 default:
2916 unreachable("Not reached"); /* zero, one swizzles handled already */
2917 }
2918 }
2919
2920 void
2921 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
2922 src_reg orig_val, uint32_t sampler,
2923 const glsl_type *dest_type)
2924 {
2925 int s = key_tex->swizzles[sampler];
2926
2927 dst_reg swizzled_result = dest;
2928
2929 if (op == ir_query_levels) {
2930 /* # levels is in .w */
2931 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2932 emit(MOV(swizzled_result, orig_val));
2933 return;
2934 }
2935
2936 if (op == ir_txs || dest_type == glsl_type::float_type
2937 || s == SWIZZLE_NOOP || op == ir_tg4) {
2938 emit(MOV(swizzled_result, orig_val));
2939 return;
2940 }
2941
2942
2943 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2944 int swizzle[4] = {0};
2945
2946 for (int i = 0; i < 4; i++) {
2947 switch (GET_SWZ(s, i)) {
2948 case SWIZZLE_ZERO:
2949 zero_mask |= (1 << i);
2950 break;
2951 case SWIZZLE_ONE:
2952 one_mask |= (1 << i);
2953 break;
2954 default:
2955 copy_mask |= (1 << i);
2956 swizzle[i] = GET_SWZ(s, i);
2957 break;
2958 }
2959 }
2960
2961 if (copy_mask) {
2962 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2963 swizzled_result.writemask = copy_mask;
2964 emit(MOV(swizzled_result, orig_val));
2965 }
2966
2967 if (zero_mask) {
2968 swizzled_result.writemask = zero_mask;
2969 emit(MOV(swizzled_result, src_reg(0.0f)));
2970 }
2971
2972 if (one_mask) {
2973 swizzled_result.writemask = one_mask;
2974 emit(MOV(swizzled_result, src_reg(1.0f)));
2975 }
2976 }
2977
2978 void
2979 vec4_visitor::visit(ir_return *)
2980 {
2981 unreachable("not reached");
2982 }
2983
2984 void
2985 vec4_visitor::visit(ir_discard *)
2986 {
2987 unreachable("not reached");
2988 }
2989
2990 void
2991 vec4_visitor::visit(ir_if *ir)
2992 {
2993 /* Don't point the annotation at the if statement, because then it plus
2994 * the then and else blocks get printed.
2995 */
2996 this->base_ir = ir->condition;
2997
2998 if (devinfo->gen == 6) {
2999 emit_if_gen6(ir);
3000 } else {
3001 enum brw_predicate predicate;
3002 emit_bool_to_cond_code(ir->condition, &predicate);
3003 emit(IF(predicate));
3004 }
3005
3006 visit_instructions(&ir->then_instructions);
3007
3008 if (!ir->else_instructions.is_empty()) {
3009 this->base_ir = ir->condition;
3010 emit(BRW_OPCODE_ELSE);
3011
3012 visit_instructions(&ir->else_instructions);
3013 }
3014
3015 this->base_ir = ir->condition;
3016 emit(BRW_OPCODE_ENDIF);
3017 }
3018
3019 void
3020 vec4_visitor::gs_emit_vertex(int stream_id)
3021 {
3022 unreachable("not reached");
3023 }
3024
3025 void
3026 vec4_visitor::visit(ir_emit_vertex *)
3027 {
3028 unreachable("not reached");
3029 }
3030
3031 void
3032 vec4_visitor::gs_end_primitive()
3033 {
3034 unreachable("not reached");
3035 }
3036
3037
3038 void
3039 vec4_visitor::visit(ir_end_primitive *)
3040 {
3041 unreachable("not reached");
3042 }
3043
3044 void
3045 vec4_visitor::visit(ir_barrier *)
3046 {
3047 unreachable("not reached");
3048 }
3049
3050 void
3051 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3052 dst_reg dst, src_reg offset,
3053 src_reg src0, src_reg src1)
3054 {
3055 unsigned mlen = 0;
3056
3057 /* Set the atomic operation offset. */
3058 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
3059 mlen++;
3060
3061 /* Set the atomic operation arguments. */
3062 if (src0.file != BAD_FILE) {
3063 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
3064 mlen++;
3065 }
3066
3067 if (src1.file != BAD_FILE) {
3068 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3069 mlen++;
3070 }
3071
3072 /* Emit the instruction. Note that this maps to the normal SIMD8
3073 * untyped atomic message on Ivy Bridge, but that's OK because
3074 * unused channels will be masked out.
3075 */
3076 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3077 brw_message_reg(0),
3078 src_reg(surf_index), src_reg(atomic_op));
3079 inst->mlen = mlen;
3080 }
3081
3082 void
3083 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3084 src_reg offset)
3085 {
3086 /* Set the surface read offset. */
3087 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3088
3089 /* Emit the instruction. Note that this maps to the normal SIMD8
3090 * untyped surface read message, but that's OK because unused
3091 * channels will be masked out.
3092 */
3093 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3094 brw_message_reg(0),
3095 src_reg(surf_index), src_reg(1));
3096 inst->mlen = 1;
3097 }
3098
3099 void
3100 vec4_visitor::emit_ndc_computation()
3101 {
3102 /* Get the position */
3103 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3104
3105 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3106 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3107 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3108
3109 current_annotation = "NDC";
3110 dst_reg ndc_w = ndc;
3111 ndc_w.writemask = WRITEMASK_W;
3112 src_reg pos_w = pos;
3113 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3114 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3115
3116 dst_reg ndc_xyz = ndc;
3117 ndc_xyz.writemask = WRITEMASK_XYZ;
3118
3119 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3120 }
3121
3122 void
3123 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3124 {
3125 if (devinfo->gen < 6 &&
3126 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3127 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
3128 devinfo->has_negative_rhw_bug)) {
3129 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3130 dst_reg header1_w = header1;
3131 header1_w.writemask = WRITEMASK_W;
3132
3133 emit(MOV(header1, 0u));
3134
3135 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3136 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3137
3138 current_annotation = "Point size";
3139 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3140 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
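/* i.e. scale the float point size by 2^11 and keep the 11-bit field at
 * bits 8..18 of the header word.
 */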
3141 }
3142
3143 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
3144 current_annotation = "Clipping flags";
3145 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3146 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3147
3148 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3149 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3150 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3151
3152 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3153 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3154 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3155 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3156 }
3157
3158 /* i965 clipping workaround:
3159 * 1) Test for -ve rhw
3160 * 2) If set,
3161 * set ndc = (0,0,0,0)
3162 * set ucp[6] = 1
3163 *
3164 * Later, clipping will detect ucp[6] and ensure the primitive is
3165 * clipped against all fixed planes.
3166 */
3167 if (devinfo->has_negative_rhw_bug) {
3168 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3169 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3170 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3171 vec4_instruction *inst;
3172 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3173 inst->predicate = BRW_PREDICATE_NORMAL;
3174 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3175 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3176 inst->predicate = BRW_PREDICATE_NORMAL;
3177 }
3178
3179 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3180 } else if (devinfo->gen < 6) {
3181 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3182 } else {
3183 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3184 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3185 dst_reg reg_w = reg;
3186 reg_w.writemask = WRITEMASK_W;
3187 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3188 reg_as_src.type = reg_w.type;
3189 reg_as_src.swizzle = brw_swizzle_for_size(1);
3190 emit(MOV(reg_w, reg_as_src));
3191 }
3192 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3193 dst_reg reg_y = reg;
3194 reg_y.writemask = WRITEMASK_Y;
3195 reg_y.type = BRW_REGISTER_TYPE_D;
3196 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3197 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3198 }
3199 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3200 dst_reg reg_z = reg;
3201 reg_z.writemask = WRITEMASK_Z;
3202 reg_z.type = BRW_REGISTER_TYPE_D;
3203 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3204 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3205 }
3206 }
3207 }
3208
3209 vec4_instruction *
3210 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3211 {
3212 assert(varying < VARYING_SLOT_MAX);
3213 assert(output_reg[varying].type == reg.type);
3214 current_annotation = output_reg_annotation[varying];
3215 /* Copy the register, saturating if necessary */
3216 return emit(MOV(reg, src_reg(output_reg[varying])));
3217 }
3218
3219 void
3220 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3221 {
3222 reg.type = BRW_REGISTER_TYPE_F;
3223 output_reg[varying].type = reg.type;
3224
3225 switch (varying) {
3226 case VARYING_SLOT_PSIZ:
3227 {
3228 /* PSIZ is always in slot 0, and is coupled with other flags. */
3229 current_annotation = "indices, point width, clip flags";
3230 emit_psiz_and_flags(reg);
3231 break;
3232 }
3233 case BRW_VARYING_SLOT_NDC:
3234 current_annotation = "NDC";
3235 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3236 break;
3237 case VARYING_SLOT_POS:
3238 current_annotation = "gl_Position";
3239 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3240 break;
3241 case VARYING_SLOT_EDGE:
3242 /* This is present when doing unfilled polygons. We're supposed to copy
3243 * the edge flag from the user-provided vertex array
3244 * (glEdgeFlagPointer); otherwise we copy from the current value
3245 * of that attribute (starts as 1.0f). This is then used in clipping to
3246 * determine which edges should be drawn as wireframe.
3247 */
3248 current_annotation = "edge flag";
3249 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3250 glsl_type::float_type, WRITEMASK_XYZW))));
3251 break;
3252 case BRW_VARYING_SLOT_PAD:
3253 /* No need to write to this slot */
3254 break;
3255 default:
3256 emit_generic_urb_slot(reg, varying);
3257 break;
3258 }
3259 }
3260
3261 static int
3262 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3263 {
3264 if (devinfo->gen >= 6) {
3265 /* URB data written (does not include the message header reg) must
3266 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3267 * section 5.4.3.2.2: URB_INTERLEAVED.
3268 *
3269 * URB entries are allocated on a multiple of 1024 bits, so an
3270 * extra 128 bits written here to make the end align to 256 is
3271 * no problem.
3272 */
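/* mlen here includes the one-register message header, so forcing it odd
 * keeps the URB data itself an even number of registers. E.g. a header
 * plus three data registers (mlen == 4) is padded to mlen == 5.
 */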
3273 if ((mlen % 2) != 1)
3274 mlen++;
3275 }
3276
3277 return mlen;
3278 }
3279
3280
3281 /**
3282 * Generates the VUE payload plus the necessary URB write instructions to
3283 * output it.
3284 *
3285 * The VUE layout is documented in Volume 2a.
3286 */
3287 void
3288 vec4_visitor::emit_vertex()
3289 {
3290 /* MRF 0 is reserved for the debugger, so start with message header
3291 * in MRF 1.
3292 */
3293 int base_mrf = 1;
3294 int mrf = base_mrf;
3295 /* In the process of generating our URB write message contents, we
3296 * may need to unspill a register or load from an array. Those
3297 * reads would use MRFs 14-15.
3298 */
3299 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
3300
3301 /* The following assertion verifies that max_usable_mrf causes an
3302 * even-numbered amount of URB write data, which will meet gen6's
3303 * requirements for length alignment.
3304 */
3305 assert((max_usable_mrf - base_mrf) % 2 == 0);
3306
3307 /* First mrf is the g0-based message header containing URB handles and
3308 * such.
3309 */
3310 emit_urb_write_header(mrf++);
3311
3312 if (devinfo->gen < 6) {
3313 emit_ndc_computation();
3314 }
3315
3316 /* We may need to split this up into several URB writes, so do them in a
3317 * loop.
3318 */
3319 int slot = 0;
3320 bool complete = false;
3321 do {
3322 /* URB offset is in URB row increments, and each of our MRFs is half of
3323 * one of those, since we're doing interleaved writes.
3324 */
3325 int offset = slot / 2;
3326
3327 mrf = base_mrf + 1;
3328 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3329 emit_urb_slot(dst_reg(MRF, mrf++),
3330 prog_data->vue_map.slot_to_varying[slot]);
3331
3332 /* If this was max_usable_mrf, we can't fit anything more into this
3333 * URB WRITE. Same thing if we reached the maximum length available.
3334 */
3335 if (mrf > max_usable_mrf ||
3336 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
3337 slot++;
3338 break;
3339 }
3340 }
3341
3342 complete = slot >= prog_data->vue_map.num_slots;
3343 current_annotation = "URB write";
3344 vec4_instruction *inst = emit_urb_write_opcode(complete);
3345 inst->base_mrf = base_mrf;
3346 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3347 inst->offset += offset;
3348 } while (!complete);
3349 }
3350
3351
3352 src_reg
3353 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3354 src_reg *reladdr, int reg_offset)
3355 {
3356 /* Because we store the values to scratch interleaved like our
3357 * vertex data, we need to scale the vec4 index by 2.
3358 */
3359 int message_header_scale = 2;
3360
3361 /* Pre-gen6, the message header uses byte offsets instead of vec4
3362 * (16-byte) offset units.
3363 */
3364 if (devinfo->gen < 6)
3365 message_header_scale *= 16;
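/* E.g. reg_offset == 3 yields an offset of 6 on Gen6+ and 96 (bytes) on
 * Gen4-5.
 */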
3366
3367 if (reladdr) {
3368 src_reg index = src_reg(this, glsl_type::int_type);
3369
3370 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3371 src_reg(reg_offset)));
3372 emit_before(block, inst, MUL(dst_reg(index), index,
3373 src_reg(message_header_scale)));
3374
3375 return index;
3376 } else {
3377 return src_reg(reg_offset * message_header_scale);
3378 }
3379 }
3380
3381 src_reg
3382 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3383 src_reg *reladdr, int reg_offset)
3384 {
3385 if (reladdr) {
3386 src_reg index = src_reg(this, glsl_type::int_type);
3387
3388 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3389 src_reg(reg_offset)));
3390
3391 /* Pre-gen6, the message header uses byte offsets instead of vec4
3392 * (16-byte) offset units.
3393 */
3394 if (devinfo->gen < 6) {
3395 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3396 }
3397
3398 return index;
3399 } else if (devinfo->gen >= 8) {
3400 /* Store the offset in a GRF so we can send-from-GRF. */
3401 src_reg offset = src_reg(this, glsl_type::int_type);
3402 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3403 return offset;
3404 } else {
3405 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3406 return src_reg(reg_offset * message_header_scale);
3407 }
3408 }
3409
3410 /**
3411 * Emits an instruction before @inst to load the value named by @orig_src
3412 * from scratch space at @base_offset to @temp.
3413 *
3414 * @base_offset is measured in 32-byte units (the size of a register).
3415 */
3416 void
3417 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3418 dst_reg temp, src_reg orig_src,
3419 int base_offset)
3420 {
3421 int reg_offset = base_offset + orig_src.reg_offset;
3422 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3423 reg_offset);
3424
3425 emit_before(block, inst, SCRATCH_READ(temp, index));
3426 }
3427
3428 /**
3429 * Emits an instruction after @inst to store the value to be written
3430 * to @orig_dst to scratch space at @base_offset, from @temp.
3431 *
3432 * @base_offset is measured in 32-byte units (the size of a register).
3433 */
3434 void
3435 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3436 int base_offset)
3437 {
3438 int reg_offset = base_offset + inst->dst.reg_offset;
3439 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3440 reg_offset);
3441
3442 /* Create a temporary register to store *inst's result in.
3443 *
3444 * We have to be careful in MOVing from our temporary result register in
3445 * the scratch write. If we swizzle from channels of the temporary that
3446 * weren't initialized, it will confuse live interval analysis, which will
3447 * make spilling fail to make progress.
3448 */
3449 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3450 inst->dst.type),
3451 brw_swizzle_for_mask(inst->dst.writemask));
3452 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3453 inst->dst.writemask));
3454 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3455 if (inst->opcode != BRW_OPCODE_SEL)
3456 write->predicate = inst->predicate;
3457 write->ir = inst->ir;
3458 write->annotation = inst->annotation;
3459 inst->insert_after(block, write);
3460
3461 inst->dst.file = temp.file;
3462 inst->dst.reg = temp.reg;
3463 inst->dst.reg_offset = temp.reg_offset;
3464 inst->dst.reladdr = NULL;
3465 }
3466
3467 /**
3468 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3469 * adds the scratch read(s) before \p inst. The function also checks for
3470 * recursive reladdr scratch accesses, issuing the corresponding scratch
3471 * loads and rewriting reladdr references accordingly.
3472 *
3473 * \return \p src if it did not require a scratch load, otherwise, the
3474 * register holding the result of the scratch load that the caller should
3475 * use to rewrite src.
3476 */
3477 src_reg
3478 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3479 vec4_instruction *inst, src_reg src)
3480 {
3481 /* Resolve recursive reladdr scratch access by calling ourselves
3482 * with src.reladdr
3483 */
3484 if (src.reladdr)
3485 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3486 *src.reladdr);
3487
3488 /* Now handle scratch access on src */
3489 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3490 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3491 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3492 src.reg = temp.reg;
3493 src.reg_offset = temp.reg_offset;
3494 src.reladdr = NULL;
3495 }
3496
3497 return src;
3498 }
3499
3500 /**
3501 * We can't generally support array access in GRF space, because a
3502 * single instruction's destination can only span 2 contiguous
3503 * registers. So, we send all GRF arrays that get variable index
3504 * access to scratch space.
3505 */
3506 void
3507 vec4_visitor::move_grf_array_access_to_scratch()
3508 {
3509 int scratch_loc[this->alloc.count];
3510 memset(scratch_loc, -1, sizeof(scratch_loc));
3511
3512 /* First, calculate the set of virtual GRFs that need to be punted
3513 * to scratch due to having any array access on them, and where in
3514 * scratch.
3515 */
3516 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3517 if (inst->dst.file == GRF && inst->dst.reladdr) {
3518 if (scratch_loc[inst->dst.reg] == -1) {
3519 scratch_loc[inst->dst.reg] = last_scratch;
3520 last_scratch += this->alloc.sizes[inst->dst.reg];
3521 }
3522
3523 for (src_reg *iter = inst->dst.reladdr;
3524 iter->reladdr;
3525 iter = iter->reladdr) {
3526 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3527 scratch_loc[iter->reg] = last_scratch;
3528 last_scratch += this->alloc.sizes[iter->reg];
3529 }
3530 }
3531 }
3532
3533 for (int i = 0 ; i < 3; i++) {
3534 for (src_reg *iter = &inst->src[i];
3535 iter->reladdr;
3536 iter = iter->reladdr) {
3537 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3538 scratch_loc[iter->reg] = last_scratch;
3539 last_scratch += this->alloc.sizes[iter->reg];
3540 }
3541 }
3542 }
3543 }
3544
3545 /* Now, for anything that will be accessed through scratch, rewrite
3546 * it to load/store. Note that this is a _safe list walk, because
3547 * we may generate a new scratch_write instruction after the one
3548 * we're processing.
3549 */
3550 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3551 /* Set up the annotation tracking for new generated instructions. */
3552 base_ir = inst->ir;
3553 current_annotation = inst->annotation;
3554
3555 /* First handle scratch access on the dst. Notice we have to handle
3556 * the case where the dst's reladdr also points to scratch space.
3557 */
3558 if (inst->dst.reladdr)
3559 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3560 *inst->dst.reladdr);
3561
3562 /* Now that we have handled any (possibly recursive) reladdr scratch
3563 * accesses for dst we can safely do the scratch write for dst itself
3564 */
3565 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3566 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3567
3568 /* Now handle scratch access on any src. In this case, since inst->src[i]
3569 * already is a src_reg, we can just call emit_resolve_reladdr with
3570 * inst->src[i] and it will take care of handling scratch loads for
3571 * both src and src.reladdr (recursively).
3572 */
3573 for (int i = 0 ; i < 3; i++) {
3574 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3575 inst->src[i]);
3576 }
3577 }
3578 }
3579
3580 /**
3581 * Emits an instruction before @inst to load the value named by @orig_src
3582 * from the pull constant buffer (surface) at @base_offset to @temp.
3583 */
3584 void
3585 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3586 dst_reg temp, src_reg orig_src,
3587 int base_offset)
3588 {
3589 int reg_offset = base_offset + orig_src.reg_offset;
3590 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3591 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3592 reg_offset);
3593
3594 emit_pull_constant_load_reg(temp,
3595 index,
3596 offset,
3597 block, inst);
3598 }
3599
3600 /**
3601 * Implements array access of uniforms by inserting a
3602 * PULL_CONSTANT_LOAD instruction.
3603 *
3604 * Unlike temporary GRF array access (where we don't support it due to
3605 * the difficulty of doing relative addressing on instruction
3606 * destinations), we could potentially do array access of uniforms
3607 * that were loaded in GRF space as push constants. In real-world
3608 * usage we've seen, though, the arrays being used are always larger
3609 * than we could load as push constants, so just always move all
3610 * uniform array access out to a pull constant buffer.
3611 */
3612 void
3613 vec4_visitor::move_uniform_array_access_to_pull_constants()
3614 {
3615 int pull_constant_loc[this->uniforms];
3616 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3617 bool nested_reladdr;
3618
3619 /* Walk through and find array access of uniforms. Put a copy of that
3620 * uniform in the pull constant buffer.
3621 *
3622 * Note that we don't move constant-indexed accesses to arrays. No
3623 * testing has been done of the performance impact of this choice.
3624 */
3625 do {
3626 nested_reladdr = false;
3627
3628 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3629 for (int i = 0 ; i < 3; i++) {
3630 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3631 continue;
3632
3633 int uniform = inst->src[i].reg;
3634
3635 if (inst->src[i].reladdr->reladdr)
3636 nested_reladdr = true; /* will need another pass */
3637
3638 /* If this array isn't already present in the pull constant buffer,
3639 * add it.
3640 */
3641 if (pull_constant_loc[uniform] == -1) {
3642 const gl_constant_value **values =
3643 &stage_prog_data->param[uniform * 4];
3644
3645 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3646
3647 assert(uniform < uniform_array_size);
3648 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3649 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3650 = values[j];
3651 }
3652 }
3653
3654 /* Set up the annotation tracking for new generated instructions. */
3655 base_ir = inst->ir;
3656 current_annotation = inst->annotation;
3657
3658 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3659
3660 emit_pull_constant_load(block, inst, temp, inst->src[i],
3661 pull_constant_loc[uniform]);
3662
3663 inst->src[i].file = temp.file;
3664 inst->src[i].reg = temp.reg;
3665 inst->src[i].reg_offset = temp.reg_offset;
3666 inst->src[i].reladdr = NULL;
3667 }
3668 }
3669 } while (nested_reladdr);
3670
3671 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3672 * no need to track them as larger-than-vec4 objects. This will be
3673 * relied on in cutting out unused uniform vectors from push
3674 * constants.
3675 */
3676 split_uniform_registers();
3677 }
3678
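/**
 * If an unsigned (UD) source still carries a negate modifier, apply the
 * negation up front by copying it through a temporary MOV, so the caller
 * ends up with a plain UD source and no modifier attached.
 */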
3679 void
3680 vec4_visitor::resolve_ud_negate(src_reg *reg)
3681 {
3682 if (reg->type != BRW_REGISTER_TYPE_UD ||
3683 !reg->negate)
3684 return;
3685
3686 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3687 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3688 *reg = temp;
3689 }
3690
3691 /**
3692 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3693 *
3694 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3695 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3696 */
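/* Worked example (added commentary): if CMP left 0x7f800001 in the result,
 * ANDing with 1 keeps only the defined LSB (1), and negating that in two's
 * complement gives 0xffffffff (~0); an LSB of 0 stays 0 through both steps.
 * That is exactly the AND + negated MOV pair emitted below.
 */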
3697 void
3698 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3699 {
3700 assert(devinfo->gen <= 5);
3701
3702 if (!rvalue->type->is_boolean())
3703 return;
3704
3705 src_reg and_result = src_reg(this, rvalue->type);
3706 src_reg neg_result = src_reg(this, rvalue->type);
3707 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3708 emit(MOV(dst_reg(neg_result), negate(and_result)));
3709 *reg = neg_result;
3710 }
3711
3712 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3713 void *log_data,
3714 struct gl_program *prog,
3715 const struct brw_sampler_prog_key_data *key_tex,
3716 struct brw_vue_prog_data *prog_data,
3717 struct gl_shader_program *shader_prog,
3718 gl_shader_stage stage,
3719 void *mem_ctx,
3720 bool no_spills,
3721 int shader_time_index)
3722 : backend_shader(compiler, log_data, mem_ctx,
3723 shader_prog, prog, &prog_data->base, stage),
3724 key_tex(key_tex),
3725 prog_data(prog_data),
3726 sanity_param_count(0),
3727 fail_msg(NULL),
3728 first_non_payload_grf(0),
3729 need_all_constants_in_pull_buffer(false),
3730 no_spills(no_spills),
3731 shader_time_index(shader_time_index),
3732 last_scratch(0)
3733 {
3734 this->failed = false;
3735
3736 this->base_ir = NULL;
3737 this->current_annotation = NULL;
3738 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3739
3740 this->variable_ht = hash_table_ctor(0,
3741 hash_table_pointer_hash,
3742 hash_table_pointer_compare);
3743
3744 this->virtual_grf_start = NULL;
3745 this->virtual_grf_end = NULL;
3746 this->live_intervals = NULL;
3747
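/* Gen7+ has no real MRF registers; the top of the GRF file is reserved to
 * stand in for them (the "MRF hack"), so register allocation has to stop
 * at GEN7_MRF_HACK_START rather than BRW_MAX_GRF.
 */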
3748 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3749
3750 this->uniforms = 0;
3751
3752 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3753 * at least one. See setup_uniforms() in brw_vec4.cpp.
3754 */
3755 this->uniform_array_size = 1;
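/* nr_params counts scalar components (see the param[uniform * 4] indexing
 * above), so rounding up to a multiple of four and dividing by four gives
 * the number of vec4 uniform slots we might need to track.
 */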
3756 if (prog_data) {
3757 this->uniform_array_size =
3758 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3759 }
3760
3761 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3762 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3763 }
3764
3765 vec4_visitor::~vec4_visitor()
3766 {
3767 hash_table_dtor(this->variable_ht);
3768 }
3769
3770
3771 void
3772 vec4_visitor::fail(const char *format, ...)
3773 {
3774 va_list va;
3775 char *msg;
3776
3777 if (failed)
3778 return;
3779
3780 failed = true;
3781
3782 va_start(va, format);
3783 msg = ralloc_vasprintf(mem_ctx, format, va);
3784 va_end(va);
3785 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3786
3787 this->fail_msg = msg;
3788
3789 if (debug_enabled) {
3790 fprintf(stderr, "%s", msg);
3791 }
3792 }
3793
3794 } /* namespace brw */