i965/vec4: Use MRF registers 21-23 for spilling in gen6
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
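/* The scratch read/write messages below are at most three registers long
 * (header plus payload), so this puts the spill MRFs at the top of the
 * message register file: m21-m23 on gen6, which exposes 24 MRFs, and
 * m13-m15 elsewhere (16 MRFs on gen4/5; gen7+ only emulates MRFs).
 */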
29 #define FIRST_SPILL_MRF(gen) (gen == 6 ? 21 : 13)
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->predicate = BRW_PREDICATE_NONE;
49 this->predicate_inverse = false;
50 this->target = 0;
51 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
52 this->shadow_compare = false;
53 this->ir = NULL;
54 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
55 this->header_size = 0;
56 this->flag_subreg = 0;
57 this->mlen = 0;
58 this->base_mrf = 0;
59 this->offset = 0;
60 this->annotation = NULL;
61 }
62
63 vec4_instruction *
64 vec4_visitor::emit(vec4_instruction *inst)
65 {
66 inst->ir = this->base_ir;
67 inst->annotation = this->current_annotation;
68
69 this->instructions.push_tail(inst);
70
71 return inst;
72 }
73
74 vec4_instruction *
75 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
76 vec4_instruction *new_inst)
77 {
78 new_inst->ir = inst->ir;
79 new_inst->annotation = inst->annotation;
80
81 inst->insert_before(block, new_inst);
82
83 return inst;
84 }
85
86 vec4_instruction *
87 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
88 const src_reg &src1, const src_reg &src2)
89 {
90 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
91 }
92
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
96 const src_reg &src1)
97 {
98 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
103 {
104 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
109 {
110 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
111 }
112
113 vec4_instruction *
114 vec4_visitor::emit(enum opcode opcode)
115 {
116 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
117 }
118
119 #define ALU1(op) \
120 vec4_instruction * \
121 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
122 { \
123 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
124 }
125
126 #define ALU2(op) \
127 vec4_instruction * \
128 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
129 const src_reg &src1) \
130 { \
131 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
132 src0, src1); \
133 }
134
135 #define ALU2_ACC(op) \
136 vec4_instruction * \
137 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
138 const src_reg &src1) \
139 { \
140 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
141 BRW_OPCODE_##op, dst, src0, src1); \
142 inst->writes_accumulator = true; \
143 return inst; \
144 }
145
146 #define ALU3(op) \
147 vec4_instruction * \
148 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
149 const src_reg &src1, const src_reg &src2) \
150 { \
151 assert(devinfo->gen >= 6); \
152 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
153 src0, src1, src2); \
154 }
155
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU1(F32TO16)
163 ALU1(F16TO32)
164 ALU2(ADD)
165 ALU2(MUL)
166 ALU2_ACC(MACH)
167 ALU2(AND)
168 ALU2(OR)
169 ALU2(XOR)
170 ALU2(DP3)
171 ALU2(DP4)
172 ALU2(DPH)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(MAC)
188
189 /** Gen4 predicated IF. */
190 vec4_instruction *
191 vec4_visitor::IF(enum brw_predicate predicate)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
196 inst->predicate = predicate;
197
198 return inst;
199 }
200
201 /** Gen6 IF with embedded comparison. */
202 vec4_instruction *
203 vec4_visitor::IF(src_reg src0, src_reg src1,
204 enum brw_conditional_mod condition)
205 {
206 assert(devinfo->gen == 6);
207
208 vec4_instruction *inst;
209
210 resolve_ud_negate(&src0);
211 resolve_ud_negate(&src1);
212
213 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
214 src0, src1);
215 inst->conditional_mod = condition;
216
217 return inst;
218 }
219
220 /**
221 * CMP: Sets the low bit of the destination channels with the result
222 * of the comparison, while the upper bits are undefined, and updates
223 * the flag register with the packed 16 bits of the result.
224 */
225 vec4_instruction *
226 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
227 enum brw_conditional_mod condition)
228 {
229 vec4_instruction *inst;
230
231 /* Take the instruction:
232 *
233 * CMP null<d> src0<f> src1<f>
234 *
235 * Original gen4 does type conversion to the destination type before
236 * comparison, producing garbage results for floating point comparisons.
237 *
238 * The destination type doesn't matter on newer generations, so we set the
239 * type to match src0 so we can compact the instruction.
240 */
241 dst.type = src0.type;
242 if (dst.file == HW_REG)
243 dst.fixed_hw_reg.type = dst.type;
244
245 resolve_ud_negate(&src0);
246 resolve_ud_negate(&src1);
247
248 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
249 inst->conditional_mod = condition;
250
251 return inst;
252 }
253
254 vec4_instruction *
255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
256 {
257 vec4_instruction *inst;
258
259 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
260 dst, index);
261 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
262 inst->mlen = 2;
263
264 return inst;
265 }
266
267 vec4_instruction *
268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
269 const src_reg &index)
270 {
271 vec4_instruction *inst;
272
273 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
274 dst, src, index);
275 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
276 inst->mlen = 3;
277
278 return inst;
279 }
280
281 void
282 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
283 {
284 static enum opcode dot_opcodes[] = {
285 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
286 };
287
288 emit(dot_opcodes[elements - 2], dst, src0, src1);
289 }
290
291 src_reg
292 vec4_visitor::fix_3src_operand(const src_reg &src)
293 {
294 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
295 * able to use vertical stride of zero to replicate the vec4 uniform, like
296 *
297 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
298 *
299 * But you can't, since vertical stride is always four in three-source
300 * instructions. Instead, insert a MOV instruction to do the replication so
301 * that the three-source instruction can consume it.
302 */
303
304 /* The MOV is only needed if the source is a uniform or immediate. */
305 if (src.file != UNIFORM && src.file != IMM)
306 return src;
307
308 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
309 return src;
310
311 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
312 expanded.type = src.type;
313 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
314 return src_reg(expanded);
315 }
316
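/* Copy a source through a MOV when it carries abs/negate modifiers, so that
 * instructions which can't honor source modifiers see a plain register.
 */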
317 src_reg
318 vec4_visitor::resolve_source_modifiers(const src_reg &src)
319 {
320 if (!src.abs && !src.negate)
321 return src;
322
323 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
324 resolved.type = src.type;
325 emit(MOV(resolved, src));
326
327 return src_reg(resolved);
328 }
329
330 src_reg
331 vec4_visitor::fix_math_operand(const src_reg &src)
332 {
333 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
334 return src;
335
336 /* The gen6 math instruction ignores the source modifiers --
337 * swizzle, abs, negate, and at least some parts of the register
338 * region description.
339 *
340 * Rather than trying to enumerate all these cases, *always* expand the
341 * operand to a temp GRF for gen6.
342 *
343 * For gen7, keep the operand as-is, except if immediate, which gen7 still
344 * can't use.
345 */
346
347 if (devinfo->gen == 7 && src.file != IMM)
348 return src;
349
350 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
351 expanded.type = src.type;
352 emit(MOV(expanded, src));
353 return src_reg(expanded);
354 }
355
356 vec4_instruction *
357 vec4_visitor::emit_math(enum opcode opcode,
358 const dst_reg &dst,
359 const src_reg &src0, const src_reg &src1)
360 {
361 vec4_instruction *math =
362 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
363
364 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
365 /* MATH on Gen6 must be align1, so we can't do writemasks. */
366 math->dst = dst_reg(this, glsl_type::vec4_type);
367 math->dst.type = dst.type;
368 math = emit(MOV(dst, src_reg(math->dst)));
369 } else if (devinfo->gen < 6) {
370 math->base_mrf = 1;
371 math->mlen = src1.file == BAD_FILE ? 1 : 2;
372 }
373
374 return math;
375 }
376
377 void
378 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
379 {
380 if (devinfo->gen < 7) {
381 unreachable("ir_unop_pack_half_2x16 should be lowered");
382 }
383
384 assert(dst.type == BRW_REGISTER_TYPE_UD);
385 assert(src0.type == BRW_REGISTER_TYPE_F);
386
387 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
388 *
389 * Because this instruction does not have a 16-bit floating-point type,
390 * the destination data type must be Word (W).
391 *
392 * The destination must be DWord-aligned and specify a horizontal stride
393 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
394 * each destination channel and the upper word is not modified.
395 *
396 * The above restriction implies that the f32to16 instruction must use
397 * align1 mode, because only in align1 mode is it possible to specify
398 * horizontal stride. We choose here to defy the hardware docs and emit
399 * align16 instructions.
400 *
401 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
402 * instructions. I was partially successful in that the code passed all
403 * tests. However, the code was dubiously correct and fragile, and the
404 * tests were not harsh enough to probe that frailty. Not trusting the
405 * code, I chose instead to remain in align16 mode in defiance of the hw
406 * docs).
407 *
408 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
409 * simulator, emitting a f32to16 in align16 mode with UD as destination
410 * data type is safe. The behavior differs from that specified in the PRM
411 * in that the upper word of each destination channel is cleared to 0.
412 */
413
414 dst_reg tmp_dst(this, glsl_type::uvec2_type);
415 src_reg tmp_src(tmp_dst);
416
417 #if 0
418 /* Verify the undocumented behavior on which the following instructions
419 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
420 * then the result of the bit-or instruction below will be incorrect.
421 *
422 * You should inspect the disasm output in order to verify that the MOV is
423 * not optimized away.
424 */
425 emit(MOV(tmp_dst, src_reg(0x12345678u)));
426 #endif
427
428 /* Give tmp the form below, where "." means untouched.
429 *
430 * w z y x w z y x
431 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
432 *
433 * The upper word of each write-channel must be 0 for the following
434 * bit-shift and bit-or instructions to work. Note that this
435 * relies on the undocumented hardware behavior mentioned above.
436 */
437 tmp_dst.writemask = WRITEMASK_XY;
438 emit(F32TO16(tmp_dst, src0));
439
440 /* Give the write-channels of dst the form:
441 * 0xhhhh0000
442 */
443 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
444 emit(SHL(dst, tmp_src, src_reg(16u)));
445
446 /* Finally, give the write-channels of dst the form of packHalf2x16's
447 * output:
448 * 0xhhhhllll
449 */
450 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
451 emit(OR(dst, src_reg(dst), tmp_src));
452 }
453
454 void
455 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
456 {
457 if (devinfo->gen < 7) {
458 unreachable("ir_unop_unpack_half_2x16 should be lowered");
459 }
460
461 assert(dst.type == BRW_REGISTER_TYPE_F);
462 assert(src0.type == BRW_REGISTER_TYPE_UD);
463
464 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
465 *
466 * Because this instruction does not have a 16-bit floating-point type,
467 * the source data type must be Word (W). The destination type must be
468 * F (Float).
469 *
470 * To use W as the source data type, we must adjust horizontal strides,
471 * which is only possible in align1 mode. All my [chadv] attempts at
472 * emitting align1 instructions for unpackHalf2x16 failed to pass the
473 * Piglit tests, so I gave up.
474 *
475 * I've verified that, on gen7 hardware and the simulator, it is safe to
476 * emit f16to32 in align16 mode with UD as source data type.
477 */
478
479 dst_reg tmp_dst(this, glsl_type::uvec2_type);
480 src_reg tmp_src(tmp_dst);
481
482 tmp_dst.writemask = WRITEMASK_X;
483 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
484
485 tmp_dst.writemask = WRITEMASK_Y;
486 emit(SHR(tmp_dst, src0, src_reg(16u)));
487
488 dst.writemask = WRITEMASK_XY;
489 emit(F16TO32(dst, tmp_src));
490 }
491
492 void
493 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
494 {
495 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
496 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
497 * is not suitable to generate the shift values, but we can use the packed
498 * vector float and a type-converting MOV.
499 */
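/* 0x00, 0x60, 0x70 and 0x78 are the packed vector-float (VF) encodings of
 * 0.0, 8.0, 16.0 and 24.0; the type-converting MOV into the uvec4 "shift"
 * register turns them into the integer shift counts <0, 8, 16, 24>.
 */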
500 dst_reg shift(this, glsl_type::uvec4_type);
501 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
502
503 dst_reg shifted(this, glsl_type::uvec4_type);
504 src0.swizzle = BRW_SWIZZLE_XXXX;
505 emit(SHR(shifted, src0, src_reg(shift)));
506
507 shifted.type = BRW_REGISTER_TYPE_UB;
508 dst_reg f(this, glsl_type::vec4_type);
509 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
510
511 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
512 }
513
514 void
515 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
516 {
517 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
518 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
519 * is not suitable to generate the shift values, but we can use the packed
520 * vector float and a type-converting MOV.
521 */
522 dst_reg shift(this, glsl_type::uvec4_type);
523 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
524
525 dst_reg shifted(this, glsl_type::uvec4_type);
526 src0.swizzle = BRW_SWIZZLE_XXXX;
527 emit(SHR(shifted, src0, src_reg(shift)));
528
529 shifted.type = BRW_REGISTER_TYPE_B;
530 dst_reg f(this, glsl_type::vec4_type);
531 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
532
533 dst_reg scaled(this, glsl_type::vec4_type);
534 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
535
536 dst_reg max(this, glsl_type::vec4_type);
537 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
538 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
539 }
540
541 void
542 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
543 {
544 dst_reg saturated(this, glsl_type::vec4_type);
545 vec4_instruction *inst = emit(MOV(saturated, src0));
546 inst->saturate = true;
547
548 dst_reg scaled(this, glsl_type::vec4_type);
549 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
550
551 dst_reg rounded(this, glsl_type::vec4_type);
552 emit(RNDE(rounded, src_reg(scaled)));
553
554 dst_reg u(this, glsl_type::uvec4_type);
555 emit(MOV(u, src_reg(rounded)));
556
557 src_reg bytes(u);
558 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
559 }
560
561 void
562 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
563 {
564 dst_reg max(this, glsl_type::vec4_type);
565 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
566
567 dst_reg min(this, glsl_type::vec4_type);
568 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
569
570 dst_reg scaled(this, glsl_type::vec4_type);
571 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
572
573 dst_reg rounded(this, glsl_type::vec4_type);
574 emit(RNDE(rounded, src_reg(scaled)));
575
576 dst_reg i(this, glsl_type::ivec4_type);
577 emit(MOV(i, src_reg(rounded)));
578
579 src_reg bytes(i);
580 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
581 }
582
583 void
584 vec4_visitor::visit_instructions(const exec_list *list)
585 {
586 foreach_in_list(ir_instruction, ir, list) {
587 base_ir = ir;
588 ir->accept(this);
589 }
590 }
591
592 /**
593 * Returns the minimum number of vec4 elements needed to pack a type.
594 *
595 * For simple types, it will return 1 (a single vec4); for matrices, the
596 * number of columns; for array and struct, the sum of the vec4_size of
597 * each of its elements; and for sampler and atomic, zero.
598 *
599 * This method is useful to calculate how much register space is needed to
600 * store a particular type.
601 */
602 extern "C" int
603 type_size_vec4(const struct glsl_type *type)
604 {
605 unsigned int i;
606 int size;
607
608 switch (type->base_type) {
609 case GLSL_TYPE_UINT:
610 case GLSL_TYPE_INT:
611 case GLSL_TYPE_FLOAT:
612 case GLSL_TYPE_BOOL:
613 if (type->is_matrix()) {
614 return type->matrix_columns;
615 } else {
616 /* Regardless of the size of the vector, it gets a vec4. This is bad
617 * packing for things like floats, but otherwise arrays become a
618 * mess. Hopefully a later pass over the code can pack scalars
619 * down if appropriate.
620 */
621 return 1;
622 }
623 case GLSL_TYPE_ARRAY:
624 assert(type->length > 0);
625 return type_size_vec4(type->fields.array) * type->length;
626 case GLSL_TYPE_STRUCT:
627 size = 0;
628 for (i = 0; i < type->length; i++) {
629 size += type_size_vec4(type->fields.structure[i].type);
630 }
631 return size;
632 case GLSL_TYPE_SUBROUTINE:
633 return 1;
634
635 case GLSL_TYPE_SAMPLER:
636 /* Samplers take up no register space, since they're baked in at
637 * link time.
638 */
639 return 0;
640 case GLSL_TYPE_ATOMIC_UINT:
641 return 0;
642 case GLSL_TYPE_IMAGE:
643 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
644 case GLSL_TYPE_VOID:
645 case GLSL_TYPE_DOUBLE:
646 case GLSL_TYPE_ERROR:
647 case GLSL_TYPE_INTERFACE:
648 unreachable("not reached");
649 }
650
651 return 0;
652 }
653
654 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size_vec4(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->swizzle = BRW_SWIZZLE_NOOP;
663 } else {
664 this->swizzle = brw_swizzle_for_size(type->vector_elements);
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
671 {
672 assert(size > 0);
673
674 init();
675
676 this->file = GRF;
677 this->reg = v->alloc.allocate(type_size_vec4(type) * size);
678
679 this->swizzle = BRW_SWIZZLE_NOOP;
680
681 this->type = brw_type_for_base_type(type);
682 }
683
684 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
685 {
686 init();
687
688 this->file = GRF;
689 this->reg = v->alloc.allocate(type_size_vec4(type));
690
691 if (type->is_array() || type->is_record()) {
692 this->writemask = WRITEMASK_XYZW;
693 } else {
694 this->writemask = (1 << type->vector_elements) - 1;
695 }
696
697 this->type = brw_type_for_base_type(type);
698 }
699
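/* Store the first n components of a vec4 uniform slot starting at
 * param_offset (counted in scalar components), zero-fill the remaining
 * components so the slot is always a full vec4, and record the slot's real
 * vector width in uniform_vector_size[].
 */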
700 void
701 vec4_visitor::setup_vec4_uniform_value(unsigned param_offset,
702 const gl_constant_value *values,
703 unsigned n)
704 {
705 static const gl_constant_value zero = { 0 };
706
707 assert(param_offset % 4 == 0);
708
709 for (unsigned i = 0; i < n; ++i)
710 stage_prog_data->param[param_offset + i] = &values[i];
711
712 for (unsigned i = n; i < 4; ++i)
713 stage_prog_data->param[param_offset + i] = &zero;
714
715 uniform_vector_size[param_offset / 4] = n;
716 }
717
718 /* Our support for uniforms is piggy-backed on the struct
719 * gl_fragment_program, because that's where the values actually
720 * get stored, rather than in some global gl_shader_program uniform
721 * store.
722 */
723 void
724 vec4_visitor::setup_uniform_values(ir_variable *ir)
725 {
726 int namelen = strlen(ir->name);
727
728 /* The data for our (non-builtin) uniforms is stored in a series of
729 * gl_uniform_driver_storage structs for each subcomponent that
730 * glGetUniformLocation() could name. We know it's been set up in the same
731 * order we'd walk the type, so walk the list of storage and find anything
732 * with our name, or the prefix of a component that starts with our name.
733 */
734 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
735 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
736
737 if (storage->builtin)
738 continue;
739
740 if (strncmp(ir->name, storage->name, namelen) != 0 ||
741 (storage->name[namelen] != 0 &&
742 storage->name[namelen] != '.' &&
743 storage->name[namelen] != '[')) {
744 continue;
745 }
746
747 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
748 storage->type->matrix_columns);
749 const unsigned vector_size = storage->type->vector_elements;
750
751 for (unsigned s = 0; s < vector_count; s++) {
752 setup_vec4_uniform_value(uniforms * 4,
753 &storage->storage[s * vector_size],
754 vector_size);
755 uniforms++;
756 }
757 }
758 }
759
760 /* Our support for builtin uniforms is even scarier than non-builtin.
761 * It sits on top of the PROG_STATE_VAR parameters that are
762 * automatically updated from GL context state.
763 */
764 void
765 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
766 {
767 const ir_state_slot *const slots = ir->get_state_slots();
768 assert(slots != NULL);
769
770 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
771 /* This state reference has already been set up by ir_to_mesa,
772 * but we'll get the same index back here. We can reference
773 * ParameterValues directly, since unlike brw_fs.cpp, we never
774 * add new state references during compile.
775 */
776 int index = _mesa_add_state_reference(this->prog->Parameters,
777 (gl_state_index *)slots[i].tokens);
778 gl_constant_value *values =
779 &this->prog->Parameters->ParameterValues[index][0];
780
781 assert(this->uniforms < uniform_array_size);
782
783 for (unsigned j = 0; j < 4; j++)
784 stage_prog_data->param[this->uniforms * 4 + j] =
785 &values[GET_SWZ(slots[i].swizzle, j)];
786
787 this->uniform_vector_size[this->uniforms] =
788 (ir->type->is_scalar() || ir->type->is_vector() ||
789 ir->type->is_matrix() ? ir->type->vector_elements : 4);
790
791 this->uniforms++;
792 }
793 }
794
795 dst_reg *
796 vec4_visitor::variable_storage(ir_variable *var)
797 {
798 return (dst_reg *)hash_table_find(this->variable_ht, var);
799 }
800
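/* Evaluate a boolean rvalue and load its value into the flag register,
 * returning in *predicate the predication mode later instructions should use
 * to test it.  Where possible the expression is folded directly into a
 * conditional-mod instruction instead of materializing a 0/~0 boolean first.
 */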
801 void
802 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
803 enum brw_predicate *predicate)
804 {
805 ir_expression *expr = ir->as_expression();
806
807 *predicate = BRW_PREDICATE_NORMAL;
808
809 if (expr && expr->operation != ir_binop_ubo_load) {
810 src_reg op[3];
811 vec4_instruction *inst;
812
813 assert(expr->get_num_operands() <= 3);
814 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
815 expr->operands[i]->accept(this);
816 op[i] = this->result;
817
818 resolve_ud_negate(&op[i]);
819 }
820
821 switch (expr->operation) {
822 case ir_unop_logic_not:
823 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
824 inst->conditional_mod = BRW_CONDITIONAL_Z;
825 break;
826
827 case ir_binop_logic_xor:
828 if (devinfo->gen <= 5) {
829 src_reg temp = src_reg(this, ir->type);
830 emit(XOR(dst_reg(temp), op[0], op[1]));
831 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
832 } else {
833 inst = emit(XOR(dst_null_d(), op[0], op[1]));
834 }
835 inst->conditional_mod = BRW_CONDITIONAL_NZ;
836 break;
837
838 case ir_binop_logic_or:
839 if (devinfo->gen <= 5) {
840 src_reg temp = src_reg(this, ir->type);
841 emit(OR(dst_reg(temp), op[0], op[1]));
842 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
843 } else {
844 inst = emit(OR(dst_null_d(), op[0], op[1]));
845 }
846 inst->conditional_mod = BRW_CONDITIONAL_NZ;
847 break;
848
849 case ir_binop_logic_and:
850 if (devinfo->gen <= 5) {
851 src_reg temp = src_reg(this, ir->type);
852 emit(AND(dst_reg(temp), op[0], op[1]));
853 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
854 } else {
855 inst = emit(AND(dst_null_d(), op[0], op[1]));
856 }
857 inst->conditional_mod = BRW_CONDITIONAL_NZ;
858 break;
859
860 case ir_unop_f2b:
861 if (devinfo->gen >= 6) {
862 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
863 } else {
864 inst = emit(MOV(dst_null_f(), op[0]));
865 inst->conditional_mod = BRW_CONDITIONAL_NZ;
866 }
867 break;
868
869 case ir_unop_i2b:
870 if (devinfo->gen >= 6) {
871 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
872 } else {
873 inst = emit(MOV(dst_null_d(), op[0]));
874 inst->conditional_mod = BRW_CONDITIONAL_NZ;
875 }
876 break;
877
878 case ir_binop_all_equal:
879 if (devinfo->gen <= 5) {
880 resolve_bool_comparison(expr->operands[0], &op[0]);
881 resolve_bool_comparison(expr->operands[1], &op[1]);
882 }
883 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
884 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
885 break;
886
887 case ir_binop_any_nequal:
888 if (devinfo->gen <= 5) {
889 resolve_bool_comparison(expr->operands[0], &op[0]);
890 resolve_bool_comparison(expr->operands[1], &op[1]);
891 }
892 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
893 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
894 break;
895
896 case ir_unop_any:
897 if (devinfo->gen <= 5) {
898 resolve_bool_comparison(expr->operands[0], &op[0]);
899 }
900 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
901 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
902 break;
903
904 case ir_binop_greater:
905 case ir_binop_gequal:
906 case ir_binop_less:
907 case ir_binop_lequal:
908 case ir_binop_equal:
909 case ir_binop_nequal:
910 if (devinfo->gen <= 5) {
911 resolve_bool_comparison(expr->operands[0], &op[0]);
912 resolve_bool_comparison(expr->operands[1], &op[1]);
913 }
914 emit(CMP(dst_null_d(), op[0], op[1],
915 brw_conditional_for_comparison(expr->operation)));
916 break;
917
918 case ir_triop_csel: {
919 /* Expand the boolean condition into the flag register. */
920 inst = emit(MOV(dst_null_d(), op[0]));
921 inst->conditional_mod = BRW_CONDITIONAL_NZ;
922
923 /* Select which boolean to return. */
924 dst_reg temp(this, expr->operands[1]->type);
925 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
926 inst->predicate = BRW_PREDICATE_NORMAL;
927
928 /* Expand the result to a condition code. */
929 inst = emit(MOV(dst_null_d(), src_reg(temp)));
930 inst->conditional_mod = BRW_CONDITIONAL_NZ;
931 break;
932 }
933
934 default:
935 unreachable("not reached");
936 }
937 return;
938 }
939
940 ir->accept(this);
941
942 resolve_ud_negate(&this->result);
943
944 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
945 inst->conditional_mod = BRW_CONDITIONAL_NZ;
946 }
947
948 /**
949 * Emit a gen6 IF statement with the comparison folded into the IF
950 * instruction.
951 */
952 void
953 vec4_visitor::emit_if_gen6(ir_if *ir)
954 {
955 ir_expression *expr = ir->condition->as_expression();
956
957 if (expr && expr->operation != ir_binop_ubo_load) {
958 src_reg op[3];
959 dst_reg temp;
960
961 assert(expr->get_num_operands() <= 3);
962 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
963 expr->operands[i]->accept(this);
964 op[i] = this->result;
965 }
966
967 switch (expr->operation) {
968 case ir_unop_logic_not:
969 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
970 return;
971
972 case ir_binop_logic_xor:
973 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
974 return;
975
976 case ir_binop_logic_or:
977 temp = dst_reg(this, glsl_type::bool_type);
978 emit(OR(temp, op[0], op[1]));
979 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
980 return;
981
982 case ir_binop_logic_and:
983 temp = dst_reg(this, glsl_type::bool_type);
984 emit(AND(temp, op[0], op[1]));
985 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
986 return;
987
988 case ir_unop_f2b:
989 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
990 return;
991
992 case ir_unop_i2b:
993 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
994 return;
995
996 case ir_binop_greater:
997 case ir_binop_gequal:
998 case ir_binop_less:
999 case ir_binop_lequal:
1000 case ir_binop_equal:
1001 case ir_binop_nequal:
1002 emit(IF(op[0], op[1],
1003 brw_conditional_for_comparison(expr->operation)));
1004 return;
1005
1006 case ir_binop_all_equal:
1007 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1008 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1009 return;
1010
1011 case ir_binop_any_nequal:
1012 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1013 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1014 return;
1015
1016 case ir_unop_any:
1017 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1018 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1019 return;
1020
1021 case ir_triop_csel: {
1022 /* Expand the boolean condition into the flag register. */
1023 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1024 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1025
1026 /* Select which boolean to return. */
1027 dst_reg temp(this, expr->operands[1]->type);
1028 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1029 inst->predicate = BRW_PREDICATE_NORMAL;
1030
1031 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1032 return;
1033 }
1034
1035 default:
1036 unreachable("not reached");
1037 }
1038 return;
1039 }
1040
1041 ir->condition->accept(this);
1042
1043 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1044 }
1045
1046 void
1047 vec4_visitor::visit(ir_variable *ir)
1048 {
1049 dst_reg *reg = NULL;
1050
1051 if (variable_storage(ir))
1052 return;
1053
1054 switch (ir->data.mode) {
1055 case ir_var_shader_in:
1056 assert(ir->data.location != -1);
1057 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1058 break;
1059
1060 case ir_var_shader_out:
1061 assert(ir->data.location != -1);
1062 reg = new(mem_ctx) dst_reg(this, ir->type);
1063
1064 for (int i = 0; i < type_size_vec4(ir->type); i++) {
1065 output_reg[ir->data.location + i] = *reg;
1066 output_reg[ir->data.location + i].reg_offset = i;
1067 output_reg_annotation[ir->data.location + i] = ir->name;
1068 }
1069 break;
1070
1071 case ir_var_auto:
1072 case ir_var_temporary:
1073 reg = new(mem_ctx) dst_reg(this, ir->type);
1074 break;
1075
1076 case ir_var_uniform:
1077 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1078
1079 /* Thanks to the lower_ubo_reference pass, we will see only
1080 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1081 * variables, so no need for them to be in variable_ht.
1082 *
1083 * Some uniforms, such as samplers and atomic counters, have no actual
1084 * storage, so we should ignore them.
1085 */
1086 if (ir->is_in_buffer_block() || type_size_vec4(ir->type) == 0)
1087 return;
1088
1089 /* Track how big the whole uniform variable is, in case we need to put a
1090 * copy of its data into pull constants for array access.
1091 */
1092 assert(this->uniforms < uniform_array_size);
1093 this->uniform_size[this->uniforms] = type_size_vec4(ir->type);
1094
1095 if (!strncmp(ir->name, "gl_", 3)) {
1096 setup_builtin_uniform_values(ir);
1097 } else {
1098 setup_uniform_values(ir);
1099 }
1100 break;
1101
1102 case ir_var_system_value:
1103 reg = make_reg_for_system_value(ir->data.location, ir->type);
1104 break;
1105
1106 default:
1107 unreachable("not reached");
1108 }
1109
1110 reg->type = brw_type_for_base_type(ir->type);
1111 hash_table_insert(this->variable_ht, reg, ir);
1112 }
1113
1114 void
1115 vec4_visitor::visit(ir_loop *ir)
1116 {
1117 /* We don't want debugging output to print the whole body of the
1118 * loop as the annotation.
1119 */
1120 this->base_ir = NULL;
1121
1122 emit(BRW_OPCODE_DO);
1123
1124 visit_instructions(&ir->body_instructions);
1125
1126 emit(BRW_OPCODE_WHILE);
1127 }
1128
1129 void
1130 vec4_visitor::visit(ir_loop_jump *ir)
1131 {
1132 switch (ir->mode) {
1133 case ir_loop_jump::jump_break:
1134 emit(BRW_OPCODE_BREAK);
1135 break;
1136 case ir_loop_jump::jump_continue:
1137 emit(BRW_OPCODE_CONTINUE);
1138 break;
1139 }
1140 }
1141
1142
1143 void
1144 vec4_visitor::visit(ir_function_signature *)
1145 {
1146 unreachable("not reached");
1147 }
1148
1149 void
1150 vec4_visitor::visit(ir_function *ir)
1151 {
1152 /* Ignore function bodies other than main() -- we shouldn't see calls to
1153 * them since they should all be inlined.
1154 */
1155 if (strcmp(ir->name, "main") == 0) {
1156 const ir_function_signature *sig;
1157 exec_list empty;
1158
1159 sig = ir->matching_signature(NULL, &empty, false);
1160
1161 assert(sig);
1162
1163 visit_instructions(&sig->body);
1164 }
1165 }
1166
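/* Try to fold an ir_binop_add of a multiply into a single MAD.  Either add
 * operand may be the multiply, optionally wrapped in a negate or abs, which
 * get folded into the MAD's source modifiers.  Returns false without
 * emitting anything if the pattern doesn't apply.
 */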
1167 bool
1168 vec4_visitor::try_emit_mad(ir_expression *ir)
1169 {
1170 /* 3-src instructions were introduced in gen6. */
1171 if (devinfo->gen < 6)
1172 return false;
1173
1174 /* MAD can only handle floating-point data. */
1175 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1176 return false;
1177
1178 ir_rvalue *nonmul;
1179 ir_expression *mul;
1180 bool mul_negate, mul_abs;
1181
1182 for (int i = 0; i < 2; i++) {
1183 mul_negate = false;
1184 mul_abs = false;
1185
1186 mul = ir->operands[i]->as_expression();
1187 nonmul = ir->operands[1 - i];
1188
1189 if (mul && mul->operation == ir_unop_abs) {
1190 mul = mul->operands[0]->as_expression();
1191 mul_abs = true;
1192 } else if (mul && mul->operation == ir_unop_neg) {
1193 mul = mul->operands[0]->as_expression();
1194 mul_negate = true;
1195 }
1196
1197 if (mul && mul->operation == ir_binop_mul)
1198 break;
1199 }
1200
1201 if (!mul || mul->operation != ir_binop_mul)
1202 return false;
1203
1204 nonmul->accept(this);
1205 src_reg src0 = fix_3src_operand(this->result);
1206
1207 mul->operands[0]->accept(this);
1208 src_reg src1 = fix_3src_operand(this->result);
1209 src1.negate ^= mul_negate;
1210 src1.abs = mul_abs;
1211 if (mul_abs)
1212 src1.negate = false;
1213
1214 mul->operands[1]->accept(this);
1215 src_reg src2 = fix_3src_operand(this->result);
1216 src2.abs = mul_abs;
1217 if (mul_abs)
1218 src2.negate = false;
1219
1220 this->result = src_reg(this, ir->type);
1221 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1222
1223 return true;
1224 }
1225
1226 bool
1227 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1228 {
1229 /* This optimization relies on CMP setting the destination to 0 when
1230 * false. Early hardware only sets the least significant bit, and
1231 * leaves the other bits undefined. So we can't use it.
1232 */
1233 if (devinfo->gen < 6)
1234 return false;
1235
1236 ir_expression *const cmp = ir->operands[0]->as_expression();
1237
1238 if (cmp == NULL)
1239 return false;
1240
1241 switch (cmp->operation) {
1242 case ir_binop_less:
1243 case ir_binop_greater:
1244 case ir_binop_lequal:
1245 case ir_binop_gequal:
1246 case ir_binop_equal:
1247 case ir_binop_nequal:
1248 break;
1249
1250 default:
1251 return false;
1252 }
1253
1254 cmp->operands[0]->accept(this);
1255 const src_reg cmp_src0 = this->result;
1256
1257 cmp->operands[1]->accept(this);
1258 const src_reg cmp_src1 = this->result;
1259
1260 this->result = src_reg(this, ir->type);
1261
1262 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1263 brw_conditional_for_comparison(cmp->operation)));
1264
1265 /* If the comparison is false, this->result will just happen to be zero.
1266 */
1267 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1268 this->result, src_reg(1.0f));
1269 inst->predicate = BRW_PREDICATE_NORMAL;
1270 inst->predicate_inverse = true;
1271
1272 return true;
1273 }
1274
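/* Emit a MIN/MAX-style select: gen6+ can use a single SEL with a conditional
 * mod, while earlier hardware needs an explicit CMP followed by a predicated
 * SEL.
 */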
1275 vec4_instruction *
1276 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1277 src_reg src0, src_reg src1)
1278 {
1279 vec4_instruction *inst;
1280
1281 if (devinfo->gen >= 6) {
1282 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1283 inst->conditional_mod = conditionalmod;
1284 } else {
1285 emit(CMP(dst, src0, src1, conditionalmod));
1286
1287 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1288 inst->predicate = BRW_PREDICATE_NORMAL;
1289 }
1290
1291 return inst;
1292 }
1293
1294 vec4_instruction *
1295 vec4_visitor::emit_lrp(const dst_reg &dst,
1296 const src_reg &x, const src_reg &y, const src_reg &a)
1297 {
1298 if (devinfo->gen >= 6) {
1299 /* Note that the instruction's argument order is reversed from GLSL
1300 * and the IR.
1301 */
1302 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1303 fix_3src_operand(x)));
1304 } else {
1305 /* Earlier generations don't support three source operations, so we
1306 * need to emit x*(1-a) + y*a.
1307 */
1308 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1309 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1310 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1311 y_times_a.writemask = dst.writemask;
1312 one_minus_a.writemask = dst.writemask;
1313 x_times_one_minus_a.writemask = dst.writemask;
1314
1315 emit(MUL(y_times_a, y, a));
1316 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1317 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1318 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1319 }
1320 }
1321
1322 /**
1323 * Emits the instructions needed to perform a pull constant load. before_block
1324 * and before_inst can be NULL, in which case the instructions will be appended
1325 * to the end of the instruction list.
1326 */
1327 void
1328 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1329 src_reg surf_index,
1330 src_reg offset_reg,
1331 bblock_t *before_block,
1332 vec4_instruction *before_inst)
1333 {
1334 assert((before_inst == NULL && before_block == NULL) ||
1335 (before_inst && before_block));
1336
1337 vec4_instruction *pull;
1338
1339 if (devinfo->gen >= 9) {
1340 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1341 src_reg header(this, glsl_type::uvec4_type, 2);
1342
1343 pull = new(mem_ctx)
1344 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1345 dst_reg(header));
1346
1347 if (before_inst)
1348 emit_before(before_block, before_inst, pull);
1349 else
1350 emit(pull);
1351
1352 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1353 offset_reg.type);
1354 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1355
1356 if (before_inst)
1357 emit_before(before_block, before_inst, pull);
1358 else
1359 emit(pull);
1360
1361 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1362 dst,
1363 surf_index,
1364 header);
1365 pull->mlen = 2;
1366 pull->header_size = 1;
1367 } else if (devinfo->gen >= 7) {
1368 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1369
1370 grf_offset.type = offset_reg.type;
1371
1372 pull = MOV(grf_offset, offset_reg);
1373
1374 if (before_inst)
1375 emit_before(before_block, before_inst, pull);
1376 else
1377 emit(pull);
1378
1379 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1380 dst,
1381 surf_index,
1382 src_reg(grf_offset));
1383 pull->mlen = 1;
1384 } else {
1385 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1386 dst,
1387 surf_index,
1388 offset_reg);
1389 pull->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
1390 pull->mlen = 1;
1391 }
1392
1393 if (before_inst)
1394 emit_before(before_block, before_inst, pull);
1395 else
1396 emit(pull);
1397 }
1398
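/* Copy the value of the first live channel of src into every channel of the
 * result so it can be used where a single uniform value is required, such as
 * the dynamically computed surface index in the UBO load path below.  Both
 * instructions use force_writemask_all since they must execute regardless of
 * the current channel enables.
 */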
1399 src_reg
1400 vec4_visitor::emit_uniformize(const src_reg &src)
1401 {
1402 const src_reg chan_index(this, glsl_type::uint_type);
1403 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1404 src.type);
1405
1406 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1407 ->force_writemask_all = true;
1408 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1409 ->force_writemask_all = true;
1410
1411 return src_reg(dst);
1412 }
1413
1414 void
1415 vec4_visitor::visit(ir_expression *ir)
1416 {
1417 unsigned int operand;
1418 src_reg op[ARRAY_SIZE(ir->operands)];
1419 vec4_instruction *inst;
1420
1421 if (ir->operation == ir_binop_add) {
1422 if (try_emit_mad(ir))
1423 return;
1424 }
1425
1426 if (ir->operation == ir_unop_b2f) {
1427 if (try_emit_b2f_of_compare(ir))
1428 return;
1429 }
1430
1431 /* Storage for our result. Ideally for an assignment we'd be using
1432 * the actual storage for the result here, instead.
1433 */
1434 dst_reg result_dst(this, ir->type);
1435 src_reg result_src(result_dst);
1436
1437 if (ir->operation == ir_triop_csel) {
1438 ir->operands[1]->accept(this);
1439 op[1] = this->result;
1440 ir->operands[2]->accept(this);
1441 op[2] = this->result;
1442
1443 enum brw_predicate predicate;
1444 emit_bool_to_cond_code(ir->operands[0], &predicate);
1445 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1446 inst->predicate = predicate;
1447 this->result = result_src;
1448 return;
1449 }
1450
1451 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1452 this->result.file = BAD_FILE;
1453 ir->operands[operand]->accept(this);
1454 if (this->result.file == BAD_FILE) {
1455 fprintf(stderr, "Failed to get tree for expression operand:\n");
1456 ir->operands[operand]->fprint(stderr);
1457 exit(1);
1458 }
1459 op[operand] = this->result;
1460
1461 /* Matrix expression operands should have been broken down to vector
1462 * operations already.
1463 */
1464 assert(!ir->operands[operand]->type->is_matrix());
1465 }
1466
1467 /* If nothing special happens, this is the result. */
1468 this->result = result_src;
1469
1470 switch (ir->operation) {
1471 case ir_unop_logic_not:
1472 emit(NOT(result_dst, op[0]));
1473 break;
1474 case ir_unop_neg:
1475 op[0].negate = !op[0].negate;
1476 emit(MOV(result_dst, op[0]));
1477 break;
1478 case ir_unop_abs:
1479 op[0].abs = true;
1480 op[0].negate = false;
1481 emit(MOV(result_dst, op[0]));
1482 break;
1483
1484 case ir_unop_sign:
1485 if (ir->type->is_float()) {
1486 /* AND(val, 0x80000000) gives the sign bit.
1487 *
1488 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1489 * zero.
1490 */
1491 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1492
1493 op[0].type = BRW_REGISTER_TYPE_UD;
1494 result_dst.type = BRW_REGISTER_TYPE_UD;
1495 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1496
1497 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1498 inst->predicate = BRW_PREDICATE_NORMAL;
1499
1500 this->result.type = BRW_REGISTER_TYPE_F;
1501 } else {
1502 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1503 * -> non-negative val generates 0x00000000.
1504 * Predicated OR sets 1 if val is positive.
1505 */
1506 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1507
1508 emit(ASR(result_dst, op[0], src_reg(31)));
1509
1510 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1511 inst->predicate = BRW_PREDICATE_NORMAL;
1512 }
1513 break;
1514
1515 case ir_unop_rcp:
1516 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1517 break;
1518
1519 case ir_unop_exp2:
1520 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1521 break;
1522 case ir_unop_log2:
1523 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1524 break;
1525 case ir_unop_exp:
1526 case ir_unop_log:
1527 unreachable("not reached: should be handled by ir_explog_to_explog2");
1528 case ir_unop_sin:
1529 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1530 break;
1531 case ir_unop_cos:
1532 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1533 break;
1534
1535 case ir_unop_dFdx:
1536 case ir_unop_dFdx_coarse:
1537 case ir_unop_dFdx_fine:
1538 case ir_unop_dFdy:
1539 case ir_unop_dFdy_coarse:
1540 case ir_unop_dFdy_fine:
1541 unreachable("derivatives not valid in vertex shader");
1542
1543 case ir_unop_bitfield_reverse:
1544 emit(BFREV(result_dst, op[0]));
1545 break;
1546 case ir_unop_bit_count:
1547 emit(CBIT(result_dst, op[0]));
1548 break;
1549 case ir_unop_find_msb: {
1550 src_reg temp = src_reg(this, glsl_type::uint_type);
1551
1552 inst = emit(FBH(dst_reg(temp), op[0]));
1553 inst->dst.writemask = WRITEMASK_XYZW;
1554
1555 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1556 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1557 * subtract the result from 31 to convert the MSB count into an LSB count.
1558 */
1559
1560 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1561 temp.swizzle = BRW_SWIZZLE_NOOP;
1562 emit(MOV(result_dst, temp));
1563
1564 src_reg src_tmp = src_reg(result_dst);
1565 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1566
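/* Compute 31 - count using a negated source, predicated so that channels
 * where FBH returned -1 (no bits set) keep that -1, which is what
 * findMSB() is specified to return there.
 */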
1567 src_tmp.negate = true;
1568 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1569 inst->predicate = BRW_PREDICATE_NORMAL;
1570 break;
1571 }
1572 case ir_unop_find_lsb:
1573 emit(FBL(result_dst, op[0]));
1574 break;
1575 case ir_unop_saturate:
1576 inst = emit(MOV(result_dst, op[0]));
1577 inst->saturate = true;
1578 break;
1579
1580 case ir_unop_noise:
1581 unreachable("not reached: should be handled by lower_noise");
1582
1583 case ir_unop_subroutine_to_int:
1584 emit(MOV(result_dst, op[0]));
1585 break;
1586
1587 case ir_binop_add:
1588 emit(ADD(result_dst, op[0], op[1]));
1589 break;
1590 case ir_binop_sub:
1591 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1592
1593 case ir_binop_mul:
1594 if (devinfo->gen < 8 && ir->type->is_integer()) {
1595 /* For integer multiplication, the MUL uses the low 16 bits of one of
1596 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1597 * accumulates the contribution of the upper 16 bits of that
1598 * operand. If we can determine that one of the args is in the low
1599 * 16 bits, though, we can just emit a single MUL.
1600 */
1601 if (ir->operands[0]->is_uint16_constant()) {
1602 if (devinfo->gen < 7)
1603 emit(MUL(result_dst, op[0], op[1]));
1604 else
1605 emit(MUL(result_dst, op[1], op[0]));
1606 } else if (ir->operands[1]->is_uint16_constant()) {
1607 if (devinfo->gen < 7)
1608 emit(MUL(result_dst, op[1], op[0]));
1609 else
1610 emit(MUL(result_dst, op[0], op[1]));
1611 } else {
1612 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1613
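/* The MUL writes a partial product to the accumulator and the MACH folds in
 * the contribution of the high 16 bits; afterwards the accumulator holds the
 * full low 32 bits of the product, which we copy to the destination.  The
 * MACH's own result (the high 32 bits) is discarded via dst_null_d().
 */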
1614 emit(MUL(acc, op[0], op[1]));
1615 emit(MACH(dst_null_d(), op[0], op[1]));
1616 emit(MOV(result_dst, src_reg(acc)));
1617 }
1618 } else {
1619 emit(MUL(result_dst, op[0], op[1]));
1620 }
1621 break;
1622 case ir_binop_imul_high: {
1623 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1624
1625 emit(MUL(acc, op[0], op[1]));
1626 emit(MACH(result_dst, op[0], op[1]));
1627 break;
1628 }
1629 case ir_binop_div:
1630 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1631 assert(ir->type->is_integer());
1632 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1633 break;
1634
1635 case ir_binop_carry:
1636 unreachable("Should have been lowered by carry_to_arith().");
1637
1638 case ir_binop_borrow:
1639 unreachable("Should have been lowered by borrow_to_arith().");
1640
1641 case ir_binop_mod:
1642 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1643 assert(ir->type->is_integer());
1644 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1645 break;
1646
1647 case ir_binop_less:
1648 case ir_binop_greater:
1649 case ir_binop_lequal:
1650 case ir_binop_gequal:
1651 case ir_binop_equal:
1652 case ir_binop_nequal: {
1653 if (devinfo->gen <= 5) {
1654 resolve_bool_comparison(ir->operands[0], &op[0]);
1655 resolve_bool_comparison(ir->operands[1], &op[1]);
1656 }
1657 emit(CMP(result_dst, op[0], op[1],
1658 brw_conditional_for_comparison(ir->operation)));
1659 break;
1660 }
1661
1662 case ir_binop_all_equal:
1663 if (devinfo->gen <= 5) {
1664 resolve_bool_comparison(ir->operands[0], &op[0]);
1665 resolve_bool_comparison(ir->operands[1], &op[1]);
1666 }
1667
1668 /* "==" operator producing a scalar boolean. */
1669 if (ir->operands[0]->type->is_vector() ||
1670 ir->operands[1]->type->is_vector()) {
1671 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1672 emit(MOV(result_dst, src_reg(0)));
1673 inst = emit(MOV(result_dst, src_reg(~0)));
1674 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1675 } else {
1676 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1677 }
1678 break;
1679 case ir_binop_any_nequal:
1680 if (devinfo->gen <= 5) {
1681 resolve_bool_comparison(ir->operands[0], &op[0]);
1682 resolve_bool_comparison(ir->operands[1], &op[1]);
1683 }
1684
1685 /* "!=" operator producing a scalar boolean. */
1686 if (ir->operands[0]->type->is_vector() ||
1687 ir->operands[1]->type->is_vector()) {
1688 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1689
1690 emit(MOV(result_dst, src_reg(0)));
1691 inst = emit(MOV(result_dst, src_reg(~0)));
1692 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1693 } else {
1694 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1695 }
1696 break;
1697
1698 case ir_unop_any:
1699 if (devinfo->gen <= 5) {
1700 resolve_bool_comparison(ir->operands[0], &op[0]);
1701 }
1702 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1703 emit(MOV(result_dst, src_reg(0)));
1704
1705 inst = emit(MOV(result_dst, src_reg(~0)));
1706 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1707 break;
1708
1709 case ir_binop_logic_xor:
1710 emit(XOR(result_dst, op[0], op[1]));
1711 break;
1712
1713 case ir_binop_logic_or:
1714 emit(OR(result_dst, op[0], op[1]));
1715 break;
1716
1717 case ir_binop_logic_and:
1718 emit(AND(result_dst, op[0], op[1]));
1719 break;
1720
1721 case ir_binop_dot:
1722 assert(ir->operands[0]->type->is_vector());
1723 assert(ir->operands[0]->type == ir->operands[1]->type);
1724 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1725 break;
1726
1727 case ir_unop_sqrt:
1728 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1729 break;
1730 case ir_unop_rsq:
1731 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1732 break;
1733
1734 case ir_unop_bitcast_i2f:
1735 case ir_unop_bitcast_u2f:
1736 this->result = op[0];
1737 this->result.type = BRW_REGISTER_TYPE_F;
1738 break;
1739
1740 case ir_unop_bitcast_f2i:
1741 this->result = op[0];
1742 this->result.type = BRW_REGISTER_TYPE_D;
1743 break;
1744
1745 case ir_unop_bitcast_f2u:
1746 this->result = op[0];
1747 this->result.type = BRW_REGISTER_TYPE_UD;
1748 break;
1749
1750 case ir_unop_i2f:
1751 case ir_unop_i2u:
1752 case ir_unop_u2i:
1753 case ir_unop_u2f:
1754 case ir_unop_f2i:
1755 case ir_unop_f2u:
1756 emit(MOV(result_dst, op[0]));
1757 break;
1758 case ir_unop_b2i:
1759 case ir_unop_b2f:
1760 if (devinfo->gen <= 5) {
1761 resolve_bool_comparison(ir->operands[0], &op[0]);
1762 }
1763 emit(MOV(result_dst, negate(op[0])));
1764 break;
1765 case ir_unop_f2b:
1766 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1767 break;
1768 case ir_unop_i2b:
1769 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1770 break;
1771
1772 case ir_unop_trunc:
1773 emit(RNDZ(result_dst, op[0]));
1774 break;
1775 case ir_unop_ceil: {
1776 src_reg tmp = src_reg(this, ir->type);
1777 op[0].negate = !op[0].negate;
1778 emit(RNDD(dst_reg(tmp), op[0]));
1779 tmp.negate = true;
1780 emit(MOV(result_dst, tmp));
1781 }
1782 break;
1783 case ir_unop_floor:
1784 inst = emit(RNDD(result_dst, op[0]));
1785 break;
1786 case ir_unop_fract:
1787 inst = emit(FRC(result_dst, op[0]));
1788 break;
1789 case ir_unop_round_even:
1790 emit(RNDE(result_dst, op[0]));
1791 break;
1792
1793 case ir_binop_min:
1794 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1795 break;
1796 case ir_binop_max:
1797 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1798 break;
1799
1800 case ir_binop_pow:
1801 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1802 break;
1803
1804 case ir_unop_bit_not:
1805 inst = emit(NOT(result_dst, op[0]));
1806 break;
1807 case ir_binop_bit_and:
1808 inst = emit(AND(result_dst, op[0], op[1]));
1809 break;
1810 case ir_binop_bit_xor:
1811 inst = emit(XOR(result_dst, op[0], op[1]));
1812 break;
1813 case ir_binop_bit_or:
1814 inst = emit(OR(result_dst, op[0], op[1]));
1815 break;
1816
1817 case ir_binop_lshift:
1818 inst = emit(SHL(result_dst, op[0], op[1]));
1819 break;
1820
1821 case ir_binop_rshift:
1822 if (ir->type->base_type == GLSL_TYPE_INT)
1823 inst = emit(ASR(result_dst, op[0], op[1]));
1824 else
1825 inst = emit(SHR(result_dst, op[0], op[1]));
1826 break;
1827
1828 case ir_binop_bfm:
1829 emit(BFI1(result_dst, op[0], op[1]));
1830 break;
1831
1832 case ir_binop_ubo_load: {
1833 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1834 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1835 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1836 src_reg offset;
1837
1838 /* Now, load the vector from that offset. */
1839 assert(ir->type->is_vector() || ir->type->is_scalar());
1840
1841 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1842 packed_consts.type = result.type;
1843 src_reg surf_index;
1844
1845 if (const_uniform_block) {
1846 /* The block index is a constant, so just emit the binding table entry
1847 * as an immediate.
1848 */
1849 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1850 const_uniform_block->value.u[0]);
1851 } else {
1852 /* The block index is not a constant. Evaluate the index expression
1853 * per-channel and add the base UBO index; we have to select a value
1854 * from any live channel.
1855 */
1856 surf_index = src_reg(this, glsl_type::uint_type);
1857 emit(ADD(dst_reg(surf_index), op[0],
1858 src_reg(prog_data->base.binding_table.ubo_start)));
1859 surf_index = emit_uniformize(surf_index);
1860
1861 /* Assume this may touch any UBO. It would be nice to provide
1862 * a tighter bound, but the array information is already lowered away.
1863 */
1864 brw_mark_surface_used(&prog_data->base,
1865 prog_data->base.binding_table.ubo_start +
1866 shader_prog->NumUniformBlocks - 1);
1867 }
1868
1869 if (const_offset_ir) {
1870 if (devinfo->gen >= 8) {
1871 /* Store the offset in a GRF so we can send-from-GRF. */
1872 offset = src_reg(this, glsl_type::int_type);
1873 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1874 } else {
1875 /* Immediates are fine on older generations since they'll be moved
1876 * to a (potentially fake) MRF at the generator level.
1877 */
1878 offset = src_reg(const_offset / 16);
1879 }
1880 } else {
1881 offset = src_reg(this, glsl_type::uint_type);
1882 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1883 }
1884
1885 emit_pull_constant_load_reg(dst_reg(packed_consts),
1886 surf_index,
1887 offset,
1888 NULL, NULL /* before_block/inst */);
1889
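/* Select the right dwords out of the fetched vec4: brw_swizzle_for_size()
 * picks the first N components, and adding the dword offset of the data
 * within the 16-byte block shifts each of them, so e.g. a vec2 at byte
 * offset 8 ends up reading .zwww.
 */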
1890 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1891 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1892 const_offset % 16 / 4,
1893 const_offset % 16 / 4,
1894 const_offset % 16 / 4);
1895
1896 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1897 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1898 emit(CMP(result_dst, packed_consts, src_reg(0u),
1899 BRW_CONDITIONAL_NZ));
1900 } else {
1901 emit(MOV(result_dst, packed_consts));
1902 }
1903 break;
1904 }
1905
1906 case ir_binop_vector_extract:
1907 unreachable("should have been lowered by vec_index_to_cond_assign");
1908
1909 case ir_triop_fma:
1910 op[0] = fix_3src_operand(op[0]);
1911 op[1] = fix_3src_operand(op[1]);
1912 op[2] = fix_3src_operand(op[2]);
1913 /* Note that the instruction's argument order is reversed from GLSL
1914 * and the IR.
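* For example, fma(a, b, c) == a * b + c becomes MAD dst, c, b, a, since the
* hardware MAD computes src0 + src1 * src2.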
1915 */
1916 emit(MAD(result_dst, op[2], op[1], op[0]));
1917 break;
1918
1919 case ir_triop_lrp:
1920 emit_lrp(result_dst, op[0], op[1], op[2]);
1921 break;
1922
1923 case ir_triop_csel:
1924 unreachable("already handled above");
1925 break;
1926
1927 case ir_triop_bfi:
1928 op[0] = fix_3src_operand(op[0]);
1929 op[1] = fix_3src_operand(op[1]);
1930 op[2] = fix_3src_operand(op[2]);
1931 emit(BFI2(result_dst, op[0], op[1], op[2]));
1932 break;
1933
1934 case ir_triop_bitfield_extract:
1935 op[0] = fix_3src_operand(op[0]);
1936 op[1] = fix_3src_operand(op[1]);
1937 op[2] = fix_3src_operand(op[2]);
1938 /* Note that the instruction's argument order is reversed from GLSL
1939 * and the IR.
1940 */
1941 emit(BFE(result_dst, op[2], op[1], op[0]));
1942 break;
1943
1944 case ir_triop_vector_insert:
1945 unreachable("should have been lowered by lower_vector_insert");
1946
1947 case ir_quadop_bitfield_insert:
1948 unreachable("not reached: should be handled by "
1949 "bitfield_insert_to_bfm_bfi\n");
1950
1951 case ir_quadop_vector:
1952 unreachable("not reached: should be handled by lower_quadop_vector");
1953
1954 case ir_unop_pack_half_2x16:
1955 emit_pack_half_2x16(result_dst, op[0]);
1956 break;
1957 case ir_unop_unpack_half_2x16:
1958 emit_unpack_half_2x16(result_dst, op[0]);
1959 break;
1960 case ir_unop_unpack_unorm_4x8:
1961 emit_unpack_unorm_4x8(result_dst, op[0]);
1962 break;
1963 case ir_unop_unpack_snorm_4x8:
1964 emit_unpack_snorm_4x8(result_dst, op[0]);
1965 break;
1966 case ir_unop_pack_unorm_4x8:
1967 emit_pack_unorm_4x8(result_dst, op[0]);
1968 break;
1969 case ir_unop_pack_snorm_4x8:
1970 emit_pack_snorm_4x8(result_dst, op[0]);
1971 break;
1972 case ir_unop_pack_snorm_2x16:
1973 case ir_unop_pack_unorm_2x16:
1974 case ir_unop_unpack_snorm_2x16:
1975 case ir_unop_unpack_unorm_2x16:
1976 unreachable("not reached: should be handled by lower_packing_builtins");
1977 case ir_unop_unpack_half_2x16_split_x:
1978 case ir_unop_unpack_half_2x16_split_y:
1979 case ir_binop_pack_half_2x16_split:
1980 case ir_unop_interpolate_at_centroid:
1981 case ir_binop_interpolate_at_sample:
1982 case ir_binop_interpolate_at_offset:
1983 unreachable("not reached: should not occur in vertex shader");
1984 case ir_binop_ldexp:
1985 unreachable("not reached: should be handled by ldexp_to_arith()");
1986 case ir_unop_d2f:
1987 case ir_unop_f2d:
1988 case ir_unop_d2i:
1989 case ir_unop_i2d:
1990 case ir_unop_d2u:
1991 case ir_unop_u2d:
1992 case ir_unop_d2b:
1993 case ir_unop_pack_double_2x32:
1994 case ir_unop_unpack_double_2x32:
1995 case ir_unop_frexp_sig:
1996 case ir_unop_frexp_exp:
1997 unreachable("fp64 todo");
1998 }
1999 }
2000
2001
2002 void
2003 vec4_visitor::visit(ir_swizzle *ir)
2004 {
2005 /* Note that this is only swizzles in expressions, not those on the left
2006 * hand side of an assignment, which do write masking. See ir_assignment
2007 * for that.
2008 */
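/* Compose the IR swizzle with a size-based swizzle so that channels beyond
 * the result's width replicate the last valid component; e.g. v.zy on a vec4
 * produces a ZYYY swizzle.
 */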
2009 const unsigned swz = brw_compose_swizzle(
2010 brw_swizzle_for_size(ir->type->vector_elements),
2011 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2012
2013 ir->val->accept(this);
2014 this->result = swizzle(this->result, swz);
2015 }
2016
2017 void
2018 vec4_visitor::visit(ir_dereference_variable *ir)
2019 {
2020 const struct glsl_type *type = ir->type;
2021 dst_reg *reg = variable_storage(ir->var);
2022
2023 if (!reg) {
2024 fail("Failed to find variable storage for %s\n", ir->var->name);
2025 this->result = src_reg(brw_null_reg());
2026 return;
2027 }
2028
2029 this->result = src_reg(*reg);
2030
2031 /* System values get their swizzle from the dst_reg writemask */
2032 if (ir->var->data.mode == ir_var_system_value)
2033 return;
2034
2035 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2036 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2037 }
2038
2039
2040 int
2041 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2042 {
2043 /* Under normal circumstances array elements are stored consecutively, so
2044 * the stride is equal to the size of the array element.
2045 */
2046 return type_size_vec4(ir->type);
2047 }
2048
2049
2050 void
2051 vec4_visitor::visit(ir_dereference_array *ir)
2052 {
2053 ir_constant *constant_index;
2054 src_reg src;
2055 int array_stride = compute_array_stride(ir);
2056
2057 constant_index = ir->array_index->constant_expression_value();
2058
2059 ir->array->accept(this);
2060 src = this->result;
2061
2062 if (constant_index) {
2063 src.reg_offset += constant_index->value.i[0] * array_stride;
2064 } else {
2065 /* Variable index array dereference. The access takes the "vec4" at the
2066 * base of the array plus a dynamically computed index that offsets the
2067 * Mesa register index.
2068 */
2069 ir->array_index->accept(this);
2070
2071 src_reg index_reg;
2072
2073 if (array_stride == 1) {
2074 index_reg = this->result;
2075 } else {
2076 index_reg = src_reg(this, glsl_type::int_type);
2077
2078 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2079 }
2080
2081 if (src.reladdr) {
2082 src_reg temp = src_reg(this, glsl_type::int_type);
2083
2084 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2085
2086 index_reg = temp;
2087 }
2088
2089 src.reladdr = ralloc(mem_ctx, src_reg);
2090 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2091 }
2092
2093 /* If the type is smaller than a vec4, replicate the last channel out. */
2094 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2095 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2096 else
2097 src.swizzle = BRW_SWIZZLE_NOOP;
2098 src.type = brw_type_for_base_type(ir->type);
2099
2100 this->result = src;
2101 }
2102
2103 void
2104 vec4_visitor::visit(ir_dereference_record *ir)
2105 {
2106 unsigned int i;
2107 const glsl_type *struct_type = ir->record->type;
2108 int offset = 0;
2109
2110 ir->record->accept(this);
2111
2112 for (i = 0; i < struct_type->length; i++) {
2113 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2114 break;
2115 offset += type_size_vec4(struct_type->fields.structure[i].type);
2116 }
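/* At this point offset is the field's location in vec4 slots; e.g. for
 * struct { vec4 a; mat3 b; float c; }, a reference to .c lands at
 * offset 1 + 3 = 4.
 */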
2117
2118 /* If the type is smaller than a vec4, replicate the last channel out. */
2119 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2120 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2121 else
2122 this->result.swizzle = BRW_SWIZZLE_NOOP;
2123 this->result.type = brw_type_for_base_type(ir->type);
2124
2125 this->result.reg_offset += offset;
2126 }
2127
2128 /**
2129 * We want to be careful in assignment setup to hit the actual storage
2130 * instead of potentially using a temporary like we might with the
2131 * ir_dereference handler.
2132 */
2133 static dst_reg
2134 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2135 {
2136 /* The LHS must be a dereference. If the LHS is a variable indexed array
2137 * access of a vector, it must be separated into a series of conditional moves
2138 * before reaching this point (see ir_vec_index_to_cond_assign).
2139 */
2140 assert(ir->as_dereference());
2141 ir_dereference_array *deref_array = ir->as_dereference_array();
2142 if (deref_array) {
2143 assert(!deref_array->array->type->is_vector());
2144 }
2145
2146 /* Use the rvalue deref handler for the most part. We'll ignore
2147 * swizzles in it and write swizzles using writemask, though.
2148 */
2149 ir->accept(v);
2150 return dst_reg(v->result);
2151 }
2152
2153 void
2154 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2155 const struct glsl_type *type,
2156 enum brw_predicate predicate)
2157 {
2158 if (type->base_type == GLSL_TYPE_STRUCT) {
2159 for (unsigned int i = 0; i < type->length; i++) {
2160 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2161 }
2162 return;
2163 }
2164
2165 if (type->is_array()) {
2166 for (unsigned int i = 0; i < type->length; i++) {
2167 emit_block_move(dst, src, type->fields.array, predicate);
2168 }
2169 return;
2170 }
2171
2172 if (type->is_matrix()) {
2173 const struct glsl_type *vec_type;
2174
2175 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2176 type->vector_elements, 1);
2177
2178 for (int i = 0; i < type->matrix_columns; i++) {
2179 emit_block_move(dst, src, vec_type, predicate);
2180 }
2181 return;
2182 }
2183
2184 assert(type->is_scalar() || type->is_vector());
2185
2186 dst->type = brw_type_for_base_type(type);
2187 src->type = dst->type;
2188
2189 dst->writemask = (1 << type->vector_elements) - 1;
2190
2191 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2192
2193 vec4_instruction *inst = emit(MOV(*dst, *src));
2194 inst->predicate = predicate;
2195
2196 dst->reg_offset++;
2197 src->reg_offset++;
2198 }
2199
2200
2201 /* If the RHS processing resulted in an instruction generating a
2202 * temporary value, and it would be easy to rewrite the instruction to
2203 * generate its result right into the LHS instead, do so. This ends
2204 * up reliably removing instructions where it can be tricky to do so
2205 * later without real UD chain information.
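* For example, if the RHS was computed by "ADD tmp, a, b", the ADD is
* rewritten to "ADD dst, a, b" and no copy MOV needs to be emitted.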
2206 */
2207 bool
2208 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2209 dst_reg dst,
2210 src_reg src,
2211 vec4_instruction *pre_rhs_inst,
2212 vec4_instruction *last_rhs_inst)
2213 {
2214 /* This could be supported, but it would take more smarts. */
2215 if (ir->condition)
2216 return false;
2217
2218 if (pre_rhs_inst == last_rhs_inst)
2219 return false; /* No instructions generated to work with. */
2220
2221 /* Make sure the last instruction generated our source reg. */
2222 if (src.file != GRF ||
2223 src.file != last_rhs_inst->dst.file ||
2224 src.reg != last_rhs_inst->dst.reg ||
2225 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2226 src.reladdr ||
2227 src.abs ||
2228 src.negate ||
2229 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2230 return false;
2231
2232 /* Check that the last instruction fully initialized the channels
2233 * we want to use, in the order we want to use them. We could
2234 * potentially reswizzle the operands of many instructions so that
2235 * we could handle out of order channels, but don't yet.
2236 */
2237
2238 for (unsigned i = 0; i < 4; i++) {
2239 if (dst.writemask & (1 << i)) {
2240 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2241 return false;
2242
2243 if (BRW_GET_SWZ(src.swizzle, i) != i)
2244 return false;
2245 }
2246 }
2247
2248 /* Success! Rewrite the instruction. */
2249 last_rhs_inst->dst.file = dst.file;
2250 last_rhs_inst->dst.reg = dst.reg;
2251 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2252 last_rhs_inst->dst.reladdr = dst.reladdr;
2253 last_rhs_inst->dst.writemask &= dst.writemask;
2254
2255 return true;
2256 }
2257
2258 void
2259 vec4_visitor::visit(ir_assignment *ir)
2260 {
2261 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2262 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2263
2264 if (!ir->lhs->type->is_scalar() &&
2265 !ir->lhs->type->is_vector()) {
2266 ir->rhs->accept(this);
2267 src_reg src = this->result;
2268
2269 if (ir->condition) {
2270 emit_bool_to_cond_code(ir->condition, &predicate);
2271 }
2272
2273 /* emit_block_move doesn't account for swizzles in the source register.
2274 * This should be ok, since the source register is a structure or an
2275 * array, and those can't be swizzled. But double-check to be sure.
2276 */
2277 assert(src.swizzle ==
2278 (ir->rhs->type->is_matrix()
2279 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2280 : BRW_SWIZZLE_NOOP));
2281
2282 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2283 return;
2284 }
2285
2286 /* Now we're down to just a scalar/vector with writemasks. */
2287 int i;
2288
2289 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2290 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2291
2292 ir->rhs->accept(this);
2293
2294 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2295
2296 int swizzles[4];
2297 int src_chan = 0;
2298
2299 assert(ir->lhs->type->is_vector() ||
2300 ir->lhs->type->is_scalar());
2301 dst.writemask = ir->write_mask;
2302
2303 /* Swizzle a small RHS vector into the channels being written.
2304 *
2305 * GLSL IR treats write_mask as dictating how many channels are
2306 * present on the RHS, while in our instructions we need to make
2307 * those channels appear in the slots of the vec4 they're written to.
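* For example, a write mask of .xz maps the RHS channels with the swizzle
* (x, x, y, x), so dst.x reads rhs.x and dst.z reads rhs.y.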
2308 */
2309 for (int i = 0; i < 4; i++)
2310 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2311
2312 src_reg src = swizzle(this->result,
2313 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2314 swizzles[2], swizzles[3]));
2315
2316 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2317 return;
2318 }
2319
2320 if (ir->condition) {
2321 emit_bool_to_cond_code(ir->condition, &predicate);
2322 }
2323
2324 for (i = 0; i < type_size_vec4(ir->lhs->type); i++) {
2325 vec4_instruction *inst = emit(MOV(dst, src));
2326 inst->predicate = predicate;
2327
2328 dst.reg_offset++;
2329 src.reg_offset++;
2330 }
2331 }
2332
2333 void
2334 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2335 {
2336 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2337 foreach_in_list(ir_constant, field_value, &ir->components) {
2338 emit_constant_values(dst, field_value);
2339 }
2340 return;
2341 }
2342
2343 if (ir->type->is_array()) {
2344 for (unsigned int i = 0; i < ir->type->length; i++) {
2345 emit_constant_values(dst, ir->array_elements[i]);
2346 }
2347 return;
2348 }
2349
2350 if (ir->type->is_matrix()) {
2351 for (int i = 0; i < ir->type->matrix_columns; i++) {
2352 float *vec = &ir->value.f[i * ir->type->vector_elements];
2353
2354 for (int j = 0; j < ir->type->vector_elements; j++) {
2355 dst->writemask = 1 << j;
2356 dst->type = BRW_REGISTER_TYPE_F;
2357
2358 emit(MOV(*dst, src_reg(vec[j])));
2359 }
2360 dst->reg_offset++;
2361 }
2362 return;
2363 }
2364
2365 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2366
2367 for (int i = 0; i < ir->type->vector_elements; i++) {
2368 if (!(remaining_writemask & (1 << i)))
2369 continue;
2370
2371 dst->writemask = 1 << i;
2372 dst->type = brw_type_for_base_type(ir->type);
2373
2374 /* Find other components that match the one we're about to
2375 * write. Emits fewer instructions for things like vec4(0.5,
2376 * 1.5, 1.5, 1.5).
2377 */
2378 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2379 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2380 if (ir->value.b[i] == ir->value.b[j])
2381 dst->writemask |= (1 << j);
2382 } else {
2383 /* u, i, and f storage all line up, so no need for a
2384 * switch case for comparing each type.
2385 */
2386 if (ir->value.u[i] == ir->value.u[j])
2387 dst->writemask |= (1 << j);
2388 }
2389 }
2390
2391 switch (ir->type->base_type) {
2392 case GLSL_TYPE_FLOAT:
2393 emit(MOV(*dst, src_reg(ir->value.f[i])));
2394 break;
2395 case GLSL_TYPE_INT:
2396 emit(MOV(*dst, src_reg(ir->value.i[i])));
2397 break;
2398 case GLSL_TYPE_UINT:
2399 emit(MOV(*dst, src_reg(ir->value.u[i])));
2400 break;
2401 case GLSL_TYPE_BOOL:
2402 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2403 break;
2404 default:
2405 unreachable("Non-float/uint/int/bool constant");
2406 }
2407
2408 remaining_writemask &= ~dst->writemask;
2409 }
2410 dst->reg_offset++;
2411 }
2412
2413 void
2414 vec4_visitor::visit(ir_constant *ir)
2415 {
2416 dst_reg dst = dst_reg(this, ir->type);
2417 this->result = src_reg(dst);
2418
2419 emit_constant_values(&dst, ir);
2420 }
2421
2422 void
2423 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2424 {
2425 ir_dereference *deref = static_cast<ir_dereference *>(
2426 ir->actual_parameters.get_head());
2427 ir_variable *location = deref->variable_referenced();
2428 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2429 location->data.binding);
2430
2431 /* Calculate the surface offset */
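/* The offset is the counter's byte offset within its binding point, plus
 * array_index * ATOMIC_COUNTER_SIZE when the counter is an array element.
 */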
2432 src_reg offset(this, glsl_type::uint_type);
2433 ir_dereference_array *deref_array = deref->as_dereference_array();
2434 if (deref_array) {
2435 deref_array->array_index->accept(this);
2436
2437 src_reg tmp(this, glsl_type::uint_type);
2438 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2439 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2440 } else {
2441 offset = location->data.atomic.offset;
2442 }
2443
2444 /* Emit the appropriate machine instruction */
2445 const char *callee = ir->callee->function_name();
2446 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2447
2448 if (!strcmp("__intrinsic_atomic_read", callee)) {
2449 emit_untyped_surface_read(surf_index, dst, offset);
2450
2451 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2452 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2453 src_reg(), src_reg());
2454
2455 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2456 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2457 src_reg(), src_reg());
2458 }
2459
2460 brw_mark_surface_used(stage_prog_data, surf_index);
2461 }
2462
2463 void
2464 vec4_visitor::visit(ir_call *ir)
2465 {
2466 const char *callee = ir->callee->function_name();
2467
2468 if (!strcmp("__intrinsic_atomic_read", callee) ||
2469 !strcmp("__intrinsic_atomic_increment", callee) ||
2470 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2471 visit_atomic_counter_intrinsic(ir);
2472 } else {
2473 unreachable("Unsupported intrinsic.");
2474 }
2475 }
2476
2477 src_reg
2478 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2479 src_reg coordinate, src_reg sampler)
2480 {
2481 vec4_instruction *inst =
2482 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2483 dst_reg(this, glsl_type::uvec4_type));
2484 inst->base_mrf = 2;
2485 inst->src[1] = sampler;
2486
2487 int param_base;
2488
2489 if (devinfo->gen >= 9) {
2490 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2491 vec4_instruction *header_inst = new(mem_ctx)
2492 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2493 dst_reg(MRF, inst->base_mrf));
2494
2495 emit(header_inst);
2496
2497 inst->mlen = 2;
2498 inst->header_size = 1;
2499 param_base = inst->base_mrf + 1;
2500 } else {
2501 inst->mlen = 1;
2502 param_base = inst->base_mrf;
2503 }
2504
2505 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2506 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2507 int zero_mask = 0xf & ~coord_mask;
2508
2509 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2510 coordinate));
2511
2512 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2513 src_reg(0)));
2514
2515 emit(inst);
2516 return src_reg(inst->dst);
2517 }
2518
2519 bool
2520 vec4_visitor::is_high_sampler(src_reg sampler)
2521 {
2522 if (devinfo->gen < 8 && !devinfo->is_haswell)
2523 return false;
2524
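/* Sampler indices above 15 don't fit in the 4-bit sampler field of the
 * message descriptor, so they need a message header to select the sampler
 * state; a non-immediate index might end up above 15, so treat it the same
 * way. Only Haswell and Gen8+ support more than 16 samplers.
 */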
2525 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2526 }
2527
2528 void
2529 vec4_visitor::emit_texture(ir_texture_opcode op,
2530 dst_reg dest,
2531 const glsl_type *dest_type,
2532 src_reg coordinate,
2533 int coord_components,
2534 src_reg shadow_comparitor,
2535 src_reg lod, src_reg lod2,
2536 src_reg sample_index,
2537 uint32_t constant_offset,
2538 src_reg offset_value,
2539 src_reg mcs,
2540 bool is_cube_array,
2541 uint32_t sampler,
2542 src_reg sampler_reg)
2543 {
2544 enum opcode opcode;
2545 switch (op) {
2546 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2547 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2548 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2549 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2550 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2551 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2552 case ir_tg4: opcode = offset_value.file != BAD_FILE
2553 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2554 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2555 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
2556 case ir_txb:
2557 unreachable("TXB is not valid for vertex shaders.");
2558 case ir_lod:
2559 unreachable("LOD is not valid for vertex shaders.");
2560 default:
2561 unreachable("Unrecognized tex op");
2562 }
2563
2564 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2565 opcode, dst_reg(this, dest_type));
2566
2567 inst->offset = constant_offset;
2568
2569 /* The message header is necessary for:
2570 * - Gen4 (always)
2571 * - Gen9+ for selecting SIMD4x2
2572 * - Texel offsets
2573 * - Gather channel selection
2574 * - Sampler indices too large to fit in a 4-bit value.
2575 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
2576 */
2577 inst->header_size =
2578 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2579 inst->offset != 0 || op == ir_tg4 ||
2580 op == ir_texture_samples ||
2581 is_high_sampler(sampler_reg)) ? 1 : 0;
2582 inst->base_mrf = 2;
2583 inst->mlen = inst->header_size;
2584 inst->dst.writemask = WRITEMASK_XYZW;
2585 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
2586
2587 inst->src[1] = sampler_reg;
2588
2589 /* MRF for the first parameter */
2590 int param_base = inst->base_mrf + inst->header_size;
2591
2592 if (op == ir_txs || op == ir_query_levels) {
2593 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2594 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2595 inst->mlen++;
2596 } else if (op == ir_texture_samples) {
2597 inst->dst.writemask = WRITEMASK_X;
2598 } else {
2599 /* Load the coordinate */
2600 /* FINISHME: gl_clamp_mask and saturate */
2601 int coord_mask = (1 << coord_components) - 1;
2602 int zero_mask = 0xf & ~coord_mask;
2603
2604 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2605 coordinate));
2606 inst->mlen++;
2607
2608 if (zero_mask != 0) {
2609 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2610 src_reg(0)));
2611 }
2612 /* Load the shadow comparitor */
2613 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
2614 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
2615 WRITEMASK_X),
2616 shadow_comparitor));
2617 inst->mlen++;
2618 }
2619
2620 /* Load the LOD info */
2621 if (op == ir_tex || op == ir_txl) {
2622 int mrf, writemask;
2623 if (devinfo->gen >= 5) {
2624 mrf = param_base + 1;
2625 if (shadow_comparitor.file != BAD_FILE) {
2626 writemask = WRITEMASK_Y;
2627 /* mlen already incremented */
2628 } else {
2629 writemask = WRITEMASK_X;
2630 inst->mlen++;
2631 }
2632 } else /* devinfo->gen == 4 */ {
2633 mrf = param_base;
2634 writemask = WRITEMASK_W;
2635 }
2636 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2637 } else if (op == ir_txf) {
2638 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2639 } else if (op == ir_txf_ms) {
2640 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2641 sample_index));
2642 if (devinfo->gen >= 7) {
2643 /* MCS data is in the first channel of `mcs`, but we need to get it into
2644 * the .y channel of the second vec4 of params, so replicate .x across
2645 * the whole vec4 and then mask off everything except .y
2646 */
2647 mcs.swizzle = BRW_SWIZZLE_XXXX;
2648 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2649 mcs));
2650 }
2651 inst->mlen++;
2652 } else if (op == ir_txd) {
2653 const brw_reg_type type = lod.type;
2654
2655 if (devinfo->gen >= 5) {
2656 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2657 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2658 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2659 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2660 inst->mlen++;
2661
2662 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
2663 lod.swizzle = BRW_SWIZZLE_ZZZZ;
2664 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
2665 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2666 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2667 inst->mlen++;
2668
2669 if (shadow_comparitor.file != BAD_FILE) {
2670 emit(MOV(dst_reg(MRF, param_base + 2,
2671 shadow_comparitor.type, WRITEMASK_Z),
2672 shadow_comparitor));
2673 }
2674 }
2675 } else /* devinfo->gen == 4 */ {
2676 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2677 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2678 inst->mlen += 2;
2679 }
2680 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
2681 if (shadow_comparitor.file != BAD_FILE) {
2682 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
2683 shadow_comparitor));
2684 }
2685
2686 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2687 offset_value));
2688 inst->mlen++;
2689 }
2690 }
2691
2692 emit(inst);
2693
2694 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2695 * spec requires layers.
2696 */
2697 if (op == ir_txs && is_cube_array) {
2698 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2699 writemask(inst->dst, WRITEMASK_Z),
2700 src_reg(inst->dst), src_reg(6));
2701 }
2702
2703 if (devinfo->gen == 6 && op == ir_tg4) {
2704 emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
2705 }
2706
2707 swizzle_result(op, dest,
2708 src_reg(inst->dst), sampler, dest_type);
2709 }
2710
2711 void
2712 vec4_visitor::visit(ir_texture *ir)
2713 {
2714 uint32_t sampler =
2715 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2716
2717 ir_rvalue *nonconst_sampler_index =
2718 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2719
2720 /* Handle non-constant sampler array indexing */
2721 src_reg sampler_reg;
2722 if (nonconst_sampler_index) {
2723 /* The highest sampler which may be used by this operation is
2724 * the last element of the array. Mark it here, because the generator
2725 * doesn't have enough information to determine the bound.
2726 */
2727 uint32_t array_size = ir->sampler->as_dereference_array()
2728 ->array->type->array_size();
2729
2730 uint32_t max_used = sampler + array_size - 1;
2731 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2732 max_used += prog_data->base.binding_table.gather_texture_start;
2733 } else {
2734 max_used += prog_data->base.binding_table.texture_start;
2735 }
2736
2737 brw_mark_surface_used(&prog_data->base, max_used);
2738
2739 /* Emit code to evaluate the actual indexing expression */
2740 nonconst_sampler_index->accept(this);
2741 src_reg temp(this, glsl_type::uint_type);
2742 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2743 sampler_reg = emit_uniformize(temp);
2744 } else {
2745 /* Single sampler, or constant array index; the indexing expression
2746 * is just an immediate.
2747 */
2748 sampler_reg = src_reg(sampler);
2749 }
2750
2751 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2752 * emitting anything other than setting up the constant result.
2753 */
2754 if (ir->op == ir_tg4) {
2755 ir_constant *chan = ir->lod_info.component->as_constant();
2756 int swiz = GET_SWZ(key_tex->swizzles[sampler], chan->value.i[0]);
2757 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2758 dst_reg result(this, ir->type);
2759 this->result = src_reg(result);
2760 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2761 return;
2762 }
2763 }
2764
2765 /* Should be lowered by do_lower_texture_projection */
2766 assert(!ir->projector);
2767
2768 /* Should be lowered */
2769 assert(!ir->offset || !ir->offset->type->is_array());
2770
2771 /* Generate code to compute all the subexpression trees. This has to be
2772 * done before loading any values into MRFs for the sampler message since
2773 * generating these values may involve SEND messages that need the MRFs.
2774 */
2775 src_reg coordinate;
2776 int coord_components = 0;
2777 if (ir->coordinate) {
2778 coord_components = ir->coordinate->type->vector_elements;
2779 ir->coordinate->accept(this);
2780 coordinate = this->result;
2781 }
2782
2783 src_reg shadow_comparitor;
2784 if (ir->shadow_comparitor) {
2785 ir->shadow_comparitor->accept(this);
2786 shadow_comparitor = this->result;
2787 }
2788
2789 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2790 src_reg offset_value;
2791 if (has_nonconstant_offset) {
2792 ir->offset->accept(this);
2793 offset_value = src_reg(this->result);
2794 }
2795
2796 src_reg lod, lod2, sample_index, mcs;
2797 switch (ir->op) {
2798 case ir_tex:
2799 lod = src_reg(0.0f);
2800 break;
2801 case ir_txf:
2802 case ir_txl:
2803 case ir_txs:
2804 ir->lod_info.lod->accept(this);
2805 lod = this->result;
2806 break;
2807 case ir_query_levels:
2808 lod = src_reg(0);
2809 break;
2810 case ir_txf_ms:
2811 ir->lod_info.sample_index->accept(this);
2812 sample_index = this->result;
2813
2814 if (devinfo->gen >= 7 && key_tex->compressed_multisample_layout_mask & (1 << sampler))
2815 mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2816 else
2817 mcs = src_reg(0u);
2818 break;
2819 case ir_txd:
2820 ir->lod_info.grad.dPdx->accept(this);
2821 lod = this->result;
2822
2823 ir->lod_info.grad.dPdy->accept(this);
2824 lod2 = this->result;
2825 break;
2826 case ir_txb:
2827 case ir_lod:
2828 case ir_tg4:
2829 case ir_texture_samples:
2830 break;
2831 }
2832
2833 uint32_t constant_offset = 0;
2834 if (ir->offset != NULL && !has_nonconstant_offset) {
2835 constant_offset =
2836 brw_texture_offset(ir->offset->as_constant()->value.i,
2837 ir->offset->type->vector_elements);
2838 }
2839
2840 /* Stuff the channel select bits in the top of the texture offset */
2841 if (ir->op == ir_tg4)
2842 constant_offset |=
2843 gather_channel( ir->lod_info.component->as_constant()->value.i[0],
2844 sampler) << 16;
2845
2846 glsl_type const *type = ir->sampler->type;
2847 bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2848 type->sampler_array;
2849
2850 this->result = src_reg(this, ir->type);
2851 dst_reg dest = dst_reg(this->result);
2852
2853 emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
2854 shadow_comparitor,
2855 lod, lod2, sample_index,
2856 constant_offset, offset_value,
2857 mcs, is_cube_array, sampler, sampler_reg);
2858 }
2859
2860 /**
2861 * Apply workarounds for Gen6 gather with UINT/SINT
2862 */
2863 void
2864 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2865 {
2866 if (!wa)
2867 return;
2868
2869 int width = (wa & WA_8BIT) ? 8 : 16;
2870 dst_reg dst_f = dst;
2871 dst_f.type = BRW_REGISTER_TYPE_F;
2872
2873 /* Convert from UNORM to UINT */
2874 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2875 emit(MOV(dst, src_reg(dst_f)));
2876
2877 if (wa & WA_SIGN) {
2878 /* Reinterpret the UINT value as a signed INT value by
2879 * shifting the sign bit into place, then shifting back
2880 * preserving sign.
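* For example, with width == 8 a raw value of 0xff is shifted left to
* 0xff000000 and arithmetically shifted back to 0xffffffff (-1).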
2881 */
2882 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2883 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2884 }
2885 }
2886
2887 /**
2888 * Set up the gather channel based on the swizzle, for gather4.
2889 */
2890 uint32_t
2891 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2892 {
2893 int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
2894 switch (swiz) {
2895 case SWIZZLE_X: return 0;
2896 case SWIZZLE_Y:
2897 /* gather4 sampler is broken for green channel on RG32F --
2898 * we must ask for blue instead.
2899 */
2900 if (key_tex->gather_channel_quirk_mask & (1 << sampler))
2901 return 2;
2902 return 1;
2903 case SWIZZLE_Z: return 2;
2904 case SWIZZLE_W: return 3;
2905 default:
2906 unreachable("Not reached"); /* zero, one swizzles handled already */
2907 }
2908 }
2909
2910 void
2911 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
2912 src_reg orig_val, uint32_t sampler,
2913 const glsl_type *dest_type)
2914 {
2915 int s = key_tex->swizzles[sampler];
2916
2917 dst_reg swizzled_result = dest;
2918
2919 if (op == ir_query_levels) {
2920 /* # levels is in .w */
2921 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2922 emit(MOV(swizzled_result, orig_val));
2923 return;
2924 }
2925
2926 if (op == ir_txs || dest_type == glsl_type::float_type
2927 || s == SWIZZLE_NOOP || op == ir_tg4) {
2928 emit(MOV(swizzled_result, orig_val));
2929 return;
2930 }
2931
2932
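/* Split the API swizzle into channels copied from the sampled result,
 * channels forced to zero and channels forced to one; e.g. a swizzle of
 * (R, G, ZERO, ONE) gives copy_mask = .xy, zero_mask = .z, one_mask = .w.
 */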
2933 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2934 int swizzle[4] = {0};
2935
2936 for (int i = 0; i < 4; i++) {
2937 switch (GET_SWZ(s, i)) {
2938 case SWIZZLE_ZERO:
2939 zero_mask |= (1 << i);
2940 break;
2941 case SWIZZLE_ONE:
2942 one_mask |= (1 << i);
2943 break;
2944 default:
2945 copy_mask |= (1 << i);
2946 swizzle[i] = GET_SWZ(s, i);
2947 break;
2948 }
2949 }
2950
2951 if (copy_mask) {
2952 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2953 swizzled_result.writemask = copy_mask;
2954 emit(MOV(swizzled_result, orig_val));
2955 }
2956
2957 if (zero_mask) {
2958 swizzled_result.writemask = zero_mask;
2959 emit(MOV(swizzled_result, src_reg(0.0f)));
2960 }
2961
2962 if (one_mask) {
2963 swizzled_result.writemask = one_mask;
2964 emit(MOV(swizzled_result, src_reg(1.0f)));
2965 }
2966 }
2967
2968 void
2969 vec4_visitor::visit(ir_return *)
2970 {
2971 unreachable("not reached");
2972 }
2973
2974 void
2975 vec4_visitor::visit(ir_discard *)
2976 {
2977 unreachable("not reached");
2978 }
2979
2980 void
2981 vec4_visitor::visit(ir_if *ir)
2982 {
2983 /* Don't point the annotation at the if statement, because then it plus
2984 * the then and else blocks get printed.
2985 */
2986 this->base_ir = ir->condition;
2987
2988 if (devinfo->gen == 6) {
2989 emit_if_gen6(ir);
2990 } else {
2991 enum brw_predicate predicate;
2992 emit_bool_to_cond_code(ir->condition, &predicate);
2993 emit(IF(predicate));
2994 }
2995
2996 visit_instructions(&ir->then_instructions);
2997
2998 if (!ir->else_instructions.is_empty()) {
2999 this->base_ir = ir->condition;
3000 emit(BRW_OPCODE_ELSE);
3001
3002 visit_instructions(&ir->else_instructions);
3003 }
3004
3005 this->base_ir = ir->condition;
3006 emit(BRW_OPCODE_ENDIF);
3007 }
3008
3009 void
3010 vec4_visitor::gs_emit_vertex(int stream_id)
3011 {
3012 unreachable("not reached");
3013 }
3014
3015 void
3016 vec4_visitor::visit(ir_emit_vertex *)
3017 {
3018 unreachable("not reached");
3019 }
3020
3021 void
3022 vec4_visitor::gs_end_primitive()
3023 {
3024 unreachable("not reached");
3025 }
3026
3027
3028 void
3029 vec4_visitor::visit(ir_end_primitive *)
3030 {
3031 unreachable("not reached");
3032 }
3033
3034 void
3035 vec4_visitor::visit(ir_barrier *)
3036 {
3037 unreachable("not reached");
3038 }
3039
3040 void
3041 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3042 dst_reg dst, src_reg offset,
3043 src_reg src0, src_reg src1)
3044 {
3045 unsigned mlen = 0;
3046
3047 /* Set the atomic operation offset. */
3048 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
3049 mlen++;
3050
3051 /* Set the atomic operation arguments. */
3052 if (src0.file != BAD_FILE) {
3053 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
3054 mlen++;
3055 }
3056
3057 if (src1.file != BAD_FILE) {
3058 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3059 mlen++;
3060 }
3061
3062 /* Emit the instruction. Note that this maps to the normal SIMD8
3063 * untyped atomic message on Ivy Bridge, but that's OK because
3064 * unused channels will be masked out.
3065 */
3066 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3067 brw_message_reg(0),
3068 src_reg(surf_index), src_reg(atomic_op));
3069 inst->mlen = mlen;
3070 }
3071
3072 void
3073 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3074 src_reg offset)
3075 {
3076 /* Set the surface read offset. */
3077 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3078
3079 /* Emit the instruction. Note that this maps to the normal SIMD8
3080 * untyped surface read message, but that's OK because unused
3081 * channels will be masked out.
3082 */
3083 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3084 brw_message_reg(0),
3085 src_reg(surf_index), src_reg(1));
3086 inst->mlen = 1;
3087 }
3088
3089 void
3090 vec4_visitor::emit_ndc_computation()
3091 {
3092 /* Get the position */
3093 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3094
3095 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3096 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3097 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3098
3099 current_annotation = "NDC";
3100 dst_reg ndc_w = ndc;
3101 ndc_w.writemask = WRITEMASK_W;
3102 src_reg pos_w = pos;
3103 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3104 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3105
3106 dst_reg ndc_xyz = ndc;
3107 ndc_xyz.writemask = WRITEMASK_XYZ;
3108
3109 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3110 }
3111
3112 void
3113 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3114 {
3115 if (devinfo->gen < 6 &&
3116 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3117 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
3118 devinfo->has_negative_rhw_bug)) {
3119 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3120 dst_reg header1_w = header1;
3121 header1_w.writemask = WRITEMASK_W;
3122
3123 emit(MOV(header1, 0u));
3124
3125 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3126 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3127
3128 current_annotation = "Point size";
3129 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3130 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3131 }
3132
3133 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
3134 current_annotation = "Clipping flags";
3135 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3136 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3137
3138 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3139 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3140 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3141
3142 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3143 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3144 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3145 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3146 }
3147
3148 /* i965 clipping workaround:
3149 * 1) Test for -ve rhw
3150 * 2) If set,
3151 * set ndc = (0,0,0,0)
3152 * set ucp[6] = 1
3153 *
3154 * Later, clipping will detect ucp[6] and ensure the primitive is
3155 * clipped against all fixed planes.
3156 */
3157 if (devinfo->has_negative_rhw_bug) {
3158 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3159 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3160 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3161 vec4_instruction *inst;
3162 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3163 inst->predicate = BRW_PREDICATE_NORMAL;
3164 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3165 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3166 inst->predicate = BRW_PREDICATE_NORMAL;
3167 }
3168
3169 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3170 } else if (devinfo->gen < 6) {
3171 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3172 } else {
3173 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3174 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3175 dst_reg reg_w = reg;
3176 reg_w.writemask = WRITEMASK_W;
3177 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3178 reg_as_src.type = reg_w.type;
3179 reg_as_src.swizzle = brw_swizzle_for_size(1);
3180 emit(MOV(reg_w, reg_as_src));
3181 }
3182 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3183 dst_reg reg_y = reg;
3184 reg_y.writemask = WRITEMASK_Y;
3185 reg_y.type = BRW_REGISTER_TYPE_D;
3186 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3187 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3188 }
3189 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3190 dst_reg reg_z = reg;
3191 reg_z.writemask = WRITEMASK_Z;
3192 reg_z.type = BRW_REGISTER_TYPE_D;
3193 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3194 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3195 }
3196 }
3197 }
3198
3199 vec4_instruction *
3200 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3201 {
3202 assert(varying < VARYING_SLOT_MAX);
3203 assert(output_reg[varying].type == reg.type);
3204 current_annotation = output_reg_annotation[varying];
3205 /* Copy the register, saturating if necessary */
3206 return emit(MOV(reg, src_reg(output_reg[varying])));
3207 }
3208
3209 void
3210 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3211 {
3212 reg.type = BRW_REGISTER_TYPE_F;
3213 output_reg[varying].type = reg.type;
3214
3215 switch (varying) {
3216 case VARYING_SLOT_PSIZ:
3217 {
3218 /* PSIZ is always in slot 0, and is coupled with other flags. */
3219 current_annotation = "indices, point width, clip flags";
3220 emit_psiz_and_flags(reg);
3221 break;
3222 }
3223 case BRW_VARYING_SLOT_NDC:
3224 current_annotation = "NDC";
3225 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3226 break;
3227 case VARYING_SLOT_POS:
3228 current_annotation = "gl_Position";
3229 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3230 break;
3231 case VARYING_SLOT_EDGE:
3232 /* This is present when doing unfilled polygons. We're supposed to copy
3233 * the edge flag from the user-provided vertex array
3234 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3235 * of that attribute (starts as 1.0f). This is then used in clipping to
3236 * determine which edges should be drawn as wireframe.
3237 */
3238 current_annotation = "edge flag";
3239 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3240 glsl_type::float_type, WRITEMASK_XYZW))));
3241 break;
3242 case BRW_VARYING_SLOT_PAD:
3243 /* No need to write to this slot */
3244 break;
3245 default:
3246 emit_generic_urb_slot(reg, varying);
3247 break;
3248 }
3249 }
3250
3251 static int
3252 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3253 {
3254 if (devinfo->gen >= 6) {
3255 /* URB data written (does not include the message header reg) must
3256 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3257 * section 5.4.3.2.2: URB_INTERLEAVED.
3258 *
3259 * URB entries are allocated on a multiple of 1024 bits, so an
3260 * extra 128 bits written here to make the end align to 256 is
3261 * no problem.
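* For example, a header plus 5 data registers (mlen == 6) gets padded to
* mlen == 7 so that 6 data registers are written.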
3262 */
3263 if ((mlen % 2) != 1)
3264 mlen++;
3265 }
3266
3267 return mlen;
3268 }
3269
3270
3271 /**
3272 * Generates the VUE payload plus the necessary URB write instructions to
3273 * output it.
3274 *
3275 * The VUE layout is documented in Volume 2a.
3276 */
3277 void
3278 vec4_visitor::emit_vertex()
3279 {
3280 /* MRF 0 is reserved for the debugger, so start with message header
3281 * in MRF 1.
3282 */
3283 int base_mrf = 1;
3284 int mrf = base_mrf;
3285 /* In the process of generating our URB write message contents, we
3286 * may need to unspill a register or load from an array. Those
3287 * reads would use the MRFs starting at FIRST_SPILL_MRF().
3288 */
3289 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
3290
3291 /* The following assertion verifies that max_usable_mrf causes an
3292 * even-numbered amount of URB write data, which will meet gen6's
3293 * requirements for length alignment.
3294 */
3295 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3296
3297 /* First mrf is the g0-based message header containing URB handles and
3298 * such.
3299 */
3300 emit_urb_write_header(mrf++);
3301
3302 if (devinfo->gen < 6) {
3303 emit_ndc_computation();
3304 }
3305
3306 /* We may need to split this up into several URB writes, so do them in a
3307 * loop.
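* Each iteration fills MRFs (base_mrf + 1)..max_usable_mrf with as many
* slots as fit, and the next write continues at the matching URB row offset.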
3308 */
3309 int slot = 0;
3310 bool complete = false;
3311 do {
3312 /* URB offset is in URB row increments, and each of our MRFs is half of
3313 * one of those, since we're doing interleaved writes.
3314 */
3315 int offset = slot / 2;
3316
3317 mrf = base_mrf + 1;
3318 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3319 emit_urb_slot(dst_reg(MRF, mrf++),
3320 prog_data->vue_map.slot_to_varying[slot]);
3321
3322 /* If this was max_usable_mrf, we can't fit anything more into this
3323 * URB WRITE. Same thing if we reached the maximum length available.
3324 */
3325 if (mrf > max_usable_mrf ||
3326 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
3327 slot++;
3328 break;
3329 }
3330 }
3331
3332 complete = slot >= prog_data->vue_map.num_slots;
3333 current_annotation = "URB write";
3334 vec4_instruction *inst = emit_urb_write_opcode(complete);
3335 inst->base_mrf = base_mrf;
3336 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3337 inst->offset += offset;
3338 } while(!complete);
3339 }
3340
3341
3342 src_reg
3343 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3344 src_reg *reladdr, int reg_offset)
3345 {
3346 /* Because we store the values to scratch interleaved like our
3347 * vertex data, we need to scale the vec4 index by 2.
3348 */
3349 int message_header_scale = 2;
3350
3351 /* Pre-gen6, the message header uses byte offsets instead of vec4
3352 * (16-byte) offset units.
3353 */
3354 if (devinfo->gen < 6)
3355 message_header_scale *= 16;
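/* For example, reg_offset 3 becomes offset 6 (in 16-byte units) on Gen6+,
 * or 96 bytes on older generations.
 */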
3356
3357 if (reladdr) {
3358 src_reg index = src_reg(this, glsl_type::int_type);
3359
3360 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3361 src_reg(reg_offset)));
3362 emit_before(block, inst, MUL(dst_reg(index), index,
3363 src_reg(message_header_scale)));
3364
3365 return index;
3366 } else {
3367 return src_reg(reg_offset * message_header_scale);
3368 }
3369 }
3370
3371 src_reg
3372 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3373 src_reg *reladdr, int reg_offset)
3374 {
3375 if (reladdr) {
3376 src_reg index = src_reg(this, glsl_type::int_type);
3377
3378 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3379 src_reg(reg_offset)));
3380
3381 /* Pre-gen6, the message header uses byte offsets instead of vec4
3382 * (16-byte) offset units.
3383 */
3384 if (devinfo->gen < 6) {
3385 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3386 }
3387
3388 return index;
3389 } else if (devinfo->gen >= 8) {
3390 /* Store the offset in a GRF so we can send-from-GRF. */
3391 src_reg offset = src_reg(this, glsl_type::int_type);
3392 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3393 return offset;
3394 } else {
3395 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3396 return src_reg(reg_offset * message_header_scale);
3397 }
3398 }
3399
3400 /**
3401 * Emits an instruction before @inst to load the value named by @orig_src
3402 * from scratch space at @base_offset to @temp.
3403 *
3404 * @base_offset is measured in 32-byte units (the size of a register).
3405 */
3406 void
3407 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3408 dst_reg temp, src_reg orig_src,
3409 int base_offset)
3410 {
3411 int reg_offset = base_offset + orig_src.reg_offset;
3412 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3413 reg_offset);
3414
3415 emit_before(block, inst, SCRATCH_READ(temp, index));
3416 }
3417
3418 /**
3419 * Emits an instruction after @inst to store the value to be written
3420 * to @orig_dst to scratch space at @base_offset, from @temp.
3421 *
3422 * @base_offset is measured in 32-byte units (the size of a register).
3423 */
3424 void
3425 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3426 int base_offset)
3427 {
3428 int reg_offset = base_offset + inst->dst.reg_offset;
3429 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3430 reg_offset);
3431
3432 /* Create a temporary register to store *inst's result in.
3433 *
3434 * We have to be careful in MOVing from our temporary result register in
3435 * the scratch write. If we swizzle from channels of the temporary that
3436 * weren't initialized, it will confuse live interval analysis, which will
3437 * make spilling fail to make progress.
3438 */
3439 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3440 inst->dst.type),
3441 brw_swizzle_for_mask(inst->dst.writemask));
3442 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3443 inst->dst.writemask));
3444 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3445 if (inst->opcode != BRW_OPCODE_SEL)
3446 write->predicate = inst->predicate;
3447 write->ir = inst->ir;
3448 write->annotation = inst->annotation;
3449 inst->insert_after(block, write);
3450
3451 inst->dst.file = temp.file;
3452 inst->dst.reg = temp.reg;
3453 inst->dst.reg_offset = temp.reg_offset;
3454 inst->dst.reladdr = NULL;
3455 }
3456
3457 /**
3458 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3459 * adds the scratch read(s) before \p inst. The function also checks for
3460 * recursive reladdr scratch accesses, issuing the corresponding scratch
3461 * loads and rewriting reladdr references accordingly.
3462 *
3463 * \return \p src if it did not require a scratch load, otherwise, the
3464 * register holding the result of the scratch load that the caller should
3465 * use to rewrite src.
3466 */
3467 src_reg
3468 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3469 vec4_instruction *inst, src_reg src)
3470 {
3471 /* Resolve recursive reladdr scratch access by calling ourselves
3472 * with src.reladdr
3473 */
3474 if (src.reladdr)
3475 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3476 *src.reladdr);
3477
3478 /* Now handle scratch access on src */
3479 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3480 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3481 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3482 src.reg = temp.reg;
3483 src.reg_offset = temp.reg_offset;
3484 src.reladdr = NULL;
3485 }
3486
3487 return src;
3488 }
3489
3490 /**
3491 * We can't generally support array access in GRF space, because a
3492 * single instruction's destination can only span 2 contiguous
3493 * registers. So, we send all GRF arrays that get variable index
3494 * access to scratch space.
3495 */
3496 void
3497 vec4_visitor::move_grf_array_access_to_scratch()
3498 {
3499 int scratch_loc[this->alloc.count];
3500 memset(scratch_loc, -1, sizeof(scratch_loc));
3501
3502 /* First, calculate the set of virtual GRFs that need to be punted
3503 * to scratch due to having any array access on them, and where in
3504 * scratch.
3505 */
3506 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3507 if (inst->dst.file == GRF && inst->dst.reladdr) {
3508 if (scratch_loc[inst->dst.reg] == -1) {
3509 scratch_loc[inst->dst.reg] = last_scratch;
3510 last_scratch += this->alloc.sizes[inst->dst.reg];
3511 }
3512
3513 for (src_reg *iter = inst->dst.reladdr;
3514 iter->reladdr;
3515 iter = iter->reladdr) {
3516 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3517 scratch_loc[iter->reg] = last_scratch;
3518 last_scratch += this->alloc.sizes[iter->reg];
3519 }
3520 }
3521 }
3522
3523 for (int i = 0 ; i < 3; i++) {
3524 for (src_reg *iter = &inst->src[i];
3525 iter->reladdr;
3526 iter = iter->reladdr) {
3527 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3528 scratch_loc[iter->reg] = last_scratch;
3529 last_scratch += this->alloc.sizes[iter->reg];
3530 }
3531 }
3532 }
3533 }
3534
3535 /* Now, for anything that will be accessed through scratch, rewrite
3536 * it to load/store. Note that this is a _safe list walk, because
3537 * we may generate a new scratch_write instruction after the one
3538 * we're processing.
3539 */
3540 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3541 /* Set up the annotation tracking for new generated instructions. */
3542 base_ir = inst->ir;
3543 current_annotation = inst->annotation;
3544
3545 /* First handle scratch access on the dst. Notice we have to handle
3546 * the case where the dst's reladdr also points to scratch space.
3547 */
3548 if (inst->dst.reladdr)
3549 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3550 *inst->dst.reladdr);
3551
3552 /* Now that we have handled any (possibly recursive) reladdr scratch
3553 * accesses for dst we can safely do the scratch write for dst itself
3554 */
3555 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3556 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3557
3558 /* Now handle scratch access on any src. In this case, since inst->src[i]
3559 * already is a src_reg, we can just call emit_resolve_reladdr with
3560 * inst->src[i] and it will take care of handling scratch loads for
3561 * both src and src.reladdr (recursively).
3562 */
3563 for (int i = 0 ; i < 3; i++) {
3564 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3565 inst->src[i]);
3566 }
3567 }
3568 }
3569
3570 /**
3571 * Emits an instruction before @inst to load the value named by @orig_src
3572 * from the pull constant buffer (surface) at @base_offset to @temp.
3573 */
3574 void
3575 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3576 dst_reg temp, src_reg orig_src,
3577 int base_offset)
3578 {
3579 int reg_offset = base_offset + orig_src.reg_offset;
3580 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3581 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3582 reg_offset);
3583
3584 emit_pull_constant_load_reg(temp,
3585 index,
3586 offset,
3587 block, inst);
3588 }
3589
3590 /**
3591 * Implements array access of uniforms by inserting a
3592 * PULL_CONSTANT_LOAD instruction.
3593 *
3594 * Unlike temporary GRF array access (where we don't support it due to
3595 * the difficulty of doing relative addressing on instruction
3596 * destinations), we could potentially do array access of uniforms
3597 * that were loaded in GRF space as push constants. In real-world
3598 * usage we've seen, though, the arrays being used are always larger
3599 * than we could load as push constants, so just always move all
3600 * uniform array access out to a pull constant buffer.
3601 */
3602 void
3603 vec4_visitor::move_uniform_array_access_to_pull_constants()
3604 {
3605 int pull_constant_loc[this->uniforms];
3606 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3607 bool nested_reladdr;
3608
3609 /* Walk through and find array access of uniforms. Put a copy of that
3610 * uniform in the pull constant buffer.
3611 *
3612 * Note that we don't move constant-indexed accesses to arrays. No
3613 * testing has been done of the performance impact of this choice.
3614 */
3615 do {
3616 nested_reladdr = false;
3617
3618 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3619 for (int i = 0 ; i < 3; i++) {
3620 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3621 continue;
3622
3623 int uniform = inst->src[i].reg;
3624
3625 if (inst->src[i].reladdr->reladdr)
3626 nested_reladdr = true; /* will need another pass */
3627
3628 /* If this array isn't already present in the pull constant buffer,
3629 * add it.
3630 */
3631 if (pull_constant_loc[uniform] == -1) {
3632 const gl_constant_value **values =
3633 &stage_prog_data->param[uniform * 4];
3634
3635 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3636
3637 assert(uniform < uniform_array_size);
3638 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3639 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3640 = values[j];
3641 }
3642 }
3643
3644 /* Set up the annotation tracking for new generated instructions. */
3645 base_ir = inst->ir;
3646 current_annotation = inst->annotation;
3647
3648 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3649
3650 emit_pull_constant_load(block, inst, temp, inst->src[i],
3651 pull_constant_loc[uniform]);
3652
3653 inst->src[i].file = temp.file;
3654 inst->src[i].reg = temp.reg;
3655 inst->src[i].reg_offset = temp.reg_offset;
3656 inst->src[i].reladdr = NULL;
3657 }
3658 }
3659 } while (nested_reladdr);
3660
3661 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3662 * no need to track them as larger-than-vec4 objects. This will be
3663 * relied on in cutting out unused uniform vectors from push
3664 * constants.
3665 */
3666 split_uniform_registers();
3667 }
3668
3669 void
3670 vec4_visitor::resolve_ud_negate(src_reg *reg)
3671 {
3672 if (reg->type != BRW_REGISTER_TYPE_UD ||
3673 !reg->negate)
3674 return;
3675
3676 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3677 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3678 *reg = temp;
3679 }
3680
3681 /**
3682 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3683 *
3684 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3685 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
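* The AND below isolates the LSB and the negated MOV then turns 1 into ~0
* (and leaves 0 as 0), yielding a canonical boolean.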
3686 */
3687 void
3688 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3689 {
3690 assert(devinfo->gen <= 5);
3691
3692 if (!rvalue->type->is_boolean())
3693 return;
3694
3695 src_reg and_result = src_reg(this, rvalue->type);
3696 src_reg neg_result = src_reg(this, rvalue->type);
3697 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3698 emit(MOV(dst_reg(neg_result), negate(and_result)));
3699 *reg = neg_result;
3700 }
3701
3702 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3703 void *log_data,
3704 struct gl_program *prog,
3705 const struct brw_sampler_prog_key_data *key_tex,
3706 struct brw_vue_prog_data *prog_data,
3707 struct gl_shader_program *shader_prog,
3708 gl_shader_stage stage,
3709 void *mem_ctx,
3710 bool no_spills,
3711 int shader_time_index)
3712 : backend_shader(compiler, log_data, mem_ctx,
3713 shader_prog, prog, &prog_data->base, stage),
3714 key_tex(key_tex),
3715 prog_data(prog_data),
3716 sanity_param_count(0),
3717 fail_msg(NULL),
3718 first_non_payload_grf(0),
3719 need_all_constants_in_pull_buffer(false),
3720 no_spills(no_spills),
3721 shader_time_index(shader_time_index),
3722 last_scratch(0)
3723 {
3724 this->failed = false;
3725
3726 this->base_ir = NULL;
3727 this->current_annotation = NULL;
3728 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3729
3730 this->variable_ht = hash_table_ctor(0,
3731 hash_table_pointer_hash,
3732 hash_table_pointer_compare);
3733
3734 this->virtual_grf_start = NULL;
3735 this->virtual_grf_end = NULL;
3736 this->live_intervals = NULL;
3737
3738 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3739
3740 this->uniforms = 0;
3741
3742 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3743 * at least one. See setup_uniforms() in brw_vec4.cpp.
3744 */
3745 this->uniform_array_size = 1;
3746 if (prog_data) {
3747 this->uniform_array_size =
3748 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3749 }
3750
3751 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3752 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3753 }
3754
3755 vec4_visitor::~vec4_visitor()
3756 {
3757 hash_table_dtor(this->variable_ht);
3758 }
3759
3760
3761 void
3762 vec4_visitor::fail(const char *format, ...)
3763 {
3764 va_list va;
3765 char *msg;
3766
3767 if (failed)
3768 return;
3769
3770 failed = true;
3771
3772 va_start(va, format);
3773 msg = ralloc_vasprintf(mem_ctx, format, va);
3774 va_end(va);
3775 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3776
3777 this->fail_msg = msg;
3778
3779 if (debug_enabled) {
3780 fprintf(stderr, "%s", msg);
3781 }
3782 }
3783
3784 } /* namespace brw */