mesa: rename gl_shader_program's NumUniformBlocks to NumBufferInterfaceBlocks
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 #define FIRST_SPILL_MRF(gen) (gen == 6 ? 21 : 13)
30
31 namespace brw {
32
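/**
 * Initialize every field of the instruction to a safe default; callers
 * override the handful they care about (predicate, conditional_mod, mlen,
 * and so on) after construction.
 */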
33 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->predicate = BRW_PREDICATE_NONE;
49 this->predicate_inverse = false;
50 this->target = 0;
51 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
52 this->shadow_compare = false;
53 this->ir = NULL;
54 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
55 this->header_size = 0;
56 this->flag_subreg = 0;
57 this->mlen = 0;
58 this->base_mrf = 0;
59 this->offset = 0;
60 this->annotation = NULL;
61 }
62
63 vec4_instruction *
64 vec4_visitor::emit(vec4_instruction *inst)
65 {
66 inst->ir = this->base_ir;
67 inst->annotation = this->current_annotation;
68
69 this->instructions.push_tail(inst);
70
71 return inst;
72 }
73
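/**
 * Insert new_inst into the instruction stream immediately before inst,
 * copying inst's IR pointer and annotation so debug output stays attached
 * to the right source construct.
 */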
74 vec4_instruction *
75 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
76 vec4_instruction *new_inst)
77 {
78 new_inst->ir = inst->ir;
79 new_inst->annotation = inst->annotation;
80
81 inst->insert_before(block, new_inst);
82
83 return inst;
84 }
85
86 vec4_instruction *
87 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
88 const src_reg &src1, const src_reg &src2)
89 {
90 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
91 }
92
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
96 const src_reg &src1)
97 {
98 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
103 {
104 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
109 {
110 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
111 }
112
113 vec4_instruction *
114 vec4_visitor::emit(enum opcode opcode)
115 {
116 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
117 }
118
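/* Convenience builders for one-, two- and three-source ALU instructions.
 * Note that these only construct the vec4_instruction; the caller still
 * has to pass the result to emit() to add it to the instruction stream.
 */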
119 #define ALU1(op) \
120 vec4_instruction * \
121 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
122 { \
123 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
124 }
125
126 #define ALU2(op) \
127 vec4_instruction * \
128 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
129 const src_reg &src1) \
130 { \
131 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
132 src0, src1); \
133 }
134
135 #define ALU2_ACC(op) \
136 vec4_instruction * \
137 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
138 const src_reg &src1) \
139 { \
140 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
141 BRW_OPCODE_##op, dst, src0, src1); \
142 inst->writes_accumulator = true; \
143 return inst; \
144 }
145
146 #define ALU3(op) \
147 vec4_instruction * \
148 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
149 const src_reg &src1, const src_reg &src2) \
150 { \
151 assert(devinfo->gen >= 6); \
152 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
153 src0, src1, src2); \
154 }
155
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU1(F32TO16)
163 ALU1(F16TO32)
164 ALU2(ADD)
165 ALU2(MUL)
166 ALU2_ACC(MACH)
167 ALU2(AND)
168 ALU2(OR)
169 ALU2(XOR)
170 ALU2(DP3)
171 ALU2(DP4)
172 ALU2(DPH)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(MAC)
188
189 /** Gen4 predicated IF. */
190 vec4_instruction *
191 vec4_visitor::IF(enum brw_predicate predicate)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
196 inst->predicate = predicate;
197
198 return inst;
199 }
200
201 /** Gen6 IF with embedded comparison. */
202 vec4_instruction *
203 vec4_visitor::IF(src_reg src0, src_reg src1,
204 enum brw_conditional_mod condition)
205 {
206 assert(devinfo->gen == 6);
207
208 vec4_instruction *inst;
209
210 resolve_ud_negate(&src0);
211 resolve_ud_negate(&src1);
212
213 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
214 src0, src1);
215 inst->conditional_mod = condition;
216
217 return inst;
218 }
219
220 /**
221 * CMP: Sets the low bit of the destination channels with the result
222 * of the comparison, while the upper bits are undefined, and updates
223 * the flag register with the packed 16 bits of the result.
224 */
225 vec4_instruction *
226 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
227 enum brw_conditional_mod condition)
228 {
229 vec4_instruction *inst;
230
231 /* Take the instruction:
232 *
233 * CMP null<d> src0<f> src1<f>
234 *
235 * Original gen4 does type conversion to the destination type before
236 * comparison, producing garbage results for floating point comparisons.
237 *
238 * The destination type doesn't matter on newer generations, so we set the
239 * type to match src0 so we can compact the instruction.
240 */
241 dst.type = src0.type;
242 if (dst.file == HW_REG)
243 dst.fixed_hw_reg.type = dst.type;
244
245 resolve_ud_negate(&src0);
246 resolve_ud_negate(&src1);
247
248 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
249 inst->conditional_mod = condition;
250
251 return inst;
252 }
253
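/**
 * Build (but do not emit) a gen4-style scratch read, used to reload
 * spilled registers.  The message starts at MRF FIRST_SPILL_MRF(gen) + 1
 * and is two registers long (mlen = 2).
 */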
254 vec4_instruction *
255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
256 {
257 vec4_instruction *inst;
258
259 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
260 dst, index);
261 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
262 inst->mlen = 2;
263
264 return inst;
265 }
266
267 vec4_instruction *
268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
269 const src_reg &index)
270 {
271 vec4_instruction *inst;
272
273 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
274 dst, src, index);
275 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
276 inst->mlen = 3;
277
278 return inst;
279 }
280
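/**
 * Emit a DP2, DP3 or DP4 depending on how many components are being
 * dotted together; elements must be in the range [2, 4].
 */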
281 void
282 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
283 {
284 static enum opcode dot_opcodes[] = {
285 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
286 };
287
288 emit(dot_opcodes[elements - 2], dst, src0, src1);
289 }
290
291 src_reg
292 vec4_visitor::fix_3src_operand(const src_reg &src)
293 {
294 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
295 * able to use vertical stride of zero to replicate the vec4 uniform, like
296 *
297 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
298 *
299 * But you can't, since vertical stride is always four in three-source
300 * instructions. Instead, insert a MOV instruction to do the replication so
301 * that the three-source instruction can consume it.
302 */
303
304 /* The MOV is only needed if the source is a uniform or immediate. */
305 if (src.file != UNIFORM && src.file != IMM)
306 return src;
307
308 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
309 return src;
310
311 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
312 expanded.type = src.type;
313 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
314 return src_reg(expanded);
315 }
316
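/**
 * If the source has abs/negate modifiers, resolve them into a temporary
 * with a plain MOV and return the temporary; otherwise return the source
 * unchanged.
 */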
317 src_reg
318 vec4_visitor::resolve_source_modifiers(const src_reg &src)
319 {
320 if (!src.abs && !src.negate)
321 return src;
322
323 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
324 resolved.type = src.type;
325 emit(MOV(resolved, src));
326
327 return src_reg(resolved);
328 }
329
330 src_reg
331 vec4_visitor::fix_math_operand(const src_reg &src)
332 {
333 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
334 return src;
335
336 /* The gen6 math instruction ignores the source modifiers --
337 * swizzle, abs, negate, and at least some parts of the register
338 * region description.
339 *
340 * Rather than trying to enumerate all these cases, *always* expand the
341 * operand to a temp GRF for gen6.
342 *
343 * For gen7, keep the operand as-is, except if immediate, which gen7 still
344 * can't use.
345 */
346
347 if (devinfo->gen == 7 && src.file != IMM)
348 return src;
349
350 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
351 expanded.type = src.type;
352 emit(MOV(expanded, src));
353 return src_reg(expanded);
354 }
355
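/**
 * Emit a math instruction, fixing up the operands as required for the
 * current generation (see fix_math_operand).  On gen6, a partial
 * writemask is handled by computing into a full temporary and MOVing the
 * result; on gen4/5 the MRF-based message fields (base_mrf/mlen) are set
 * up here.
 */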
356 vec4_instruction *
357 vec4_visitor::emit_math(enum opcode opcode,
358 const dst_reg &dst,
359 const src_reg &src0, const src_reg &src1)
360 {
361 vec4_instruction *math =
362 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
363
364 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
365 /* MATH on Gen6 must be align1, so we can't do writemasks. */
366 math->dst = dst_reg(this, glsl_type::vec4_type);
367 math->dst.type = dst.type;
368 math = emit(MOV(dst, src_reg(math->dst)));
369 } else if (devinfo->gen < 6) {
370 math->base_mrf = 1;
371 math->mlen = src1.file == BAD_FILE ? 1 : 2;
372 }
373
374 return math;
375 }
376
377 void
378 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
379 {
380 if (devinfo->gen < 7) {
381 unreachable("ir_unop_pack_half_2x16 should be lowered");
382 }
383
384 assert(dst.type == BRW_REGISTER_TYPE_UD);
385 assert(src0.type == BRW_REGISTER_TYPE_F);
386
387 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
388 *
389 * Because this instruction does not have a 16-bit floating-point type,
390 * the destination data type must be Word (W).
391 *
392 * The destination must be DWord-aligned and specify a horizontal stride
393 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
394 * each destination channel and the upper word is not modified.
395 *
396 * The above restriction implies that the f32to16 instruction must use
397 * align1 mode, because only in align1 mode is it possible to specify
398 * horizontal stride. We choose here to defy the hardware docs and emit
399 * align16 instructions.
400 *
401 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
402 * instructions. I was partially successful in that the code passed all
403 * tests. However, the code was dubiously correct and fragile, and the
404 * tests were not harsh enough to probe that frailty. Not trusting the
405 * code, I chose instead to remain in align16 mode in defiance of the hw
406 * docs).
407 *
408 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
409 * simulator, emitting a f32to16 in align16 mode with UD as destination
410 * data type is safe. The behavior differs from that specified in the PRM
411 * in that the upper word of each destination channel is cleared to 0.
412 */
413
414 dst_reg tmp_dst(this, glsl_type::uvec2_type);
415 src_reg tmp_src(tmp_dst);
416
417 #if 0
418 /* Verify the undocumented behavior on which the following instructions
419 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
420 * then the result of the bit-or instruction below will be incorrect.
421 *
422 * You should inspect the disasm output in order to verify that the MOV is
423 * not optimized away.
424 */
425 emit(MOV(tmp_dst, src_reg(0x12345678u)));
426 #endif
427
428 /* Give tmp the form below, where "." means untouched.
429 *
430 * w z y x w z y x
431 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
432 *
433 * That the upper word of each write-channel be 0 is required for the
434 * following bit-shift and bit-or instructions to work. Note that this
435 * relies on the undocumented hardware behavior mentioned above.
436 */
437 tmp_dst.writemask = WRITEMASK_XY;
438 emit(F32TO16(tmp_dst, src0));
439
440 /* Give the write-channels of dst the form:
441 * 0xhhhh0000
442 */
443 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
444 emit(SHL(dst, tmp_src, src_reg(16u)));
445
446 /* Finally, give the write-channels of dst the form of packHalf2x16's
447 * output:
448 * 0xhhhhllll
449 */
450 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
451 emit(OR(dst, src_reg(dst), tmp_src));
452 }
453
454 void
455 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
456 {
457 if (devinfo->gen < 7) {
458 unreachable("ir_unop_unpack_half_2x16 should be lowered");
459 }
460
461 assert(dst.type == BRW_REGISTER_TYPE_F);
462 assert(src0.type == BRW_REGISTER_TYPE_UD);
463
464 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
465 *
466 * Because this instruction does not have a 16-bit floating-point type,
467 * the source data type must be Word (W). The destination type must be
468 * F (Float).
469 *
470 * To use W as the source data type, we must adjust horizontal strides,
471 * which is only possible in align1 mode. All my [chadv] attempts at
472 * emitting align1 instructions for unpackHalf2x16 failed to pass the
473 * Piglit tests, so I gave up.
474 *
475 * I've verified that, on gen7 hardware and the simulator, it is safe to
476 * emit f16to32 in align16 mode with UD as source data type.
477 */
478
479 dst_reg tmp_dst(this, glsl_type::uvec2_type);
480 src_reg tmp_src(tmp_dst);
481
482 tmp_dst.writemask = WRITEMASK_X;
483 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
484
485 tmp_dst.writemask = WRITEMASK_Y;
486 emit(SHR(tmp_dst, src0, src_reg(16u)));
487
488 dst.writemask = WRITEMASK_XY;
489 emit(F16TO32(dst, tmp_src));
490 }
491
492 void
493 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
494 {
495 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
496 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
497 * is not suitable to generate the shift values, but we can use the packed
498 * vector float and a type-converting MOV.
499 */
500 dst_reg shift(this, glsl_type::uvec4_type);
501 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
502
503 dst_reg shifted(this, glsl_type::uvec4_type);
504 src0.swizzle = BRW_SWIZZLE_XXXX;
505 emit(SHR(shifted, src0, src_reg(shift)));
506
507 shifted.type = BRW_REGISTER_TYPE_UB;
508 dst_reg f(this, glsl_type::vec4_type);
509 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
510
511 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
512 }
513
514 void
515 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
516 {
517 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
518 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
519 * is not suitable to generate the shift values, but we can use the packed
520 * vector float and a type-converting MOV.
521 */
522 dst_reg shift(this, glsl_type::uvec4_type);
523 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
524
525 dst_reg shifted(this, glsl_type::uvec4_type);
526 src0.swizzle = BRW_SWIZZLE_XXXX;
527 emit(SHR(shifted, src0, src_reg(shift)));
528
529 shifted.type = BRW_REGISTER_TYPE_B;
530 dst_reg f(this, glsl_type::vec4_type);
531 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
532
533 dst_reg scaled(this, glsl_type::vec4_type);
534 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
535
536 dst_reg max(this, glsl_type::vec4_type);
537 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
538 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
539 }
540
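/**
 * packUnorm4x8: saturate to [0, 1], scale by 255, round to nearest even,
 * convert to unsigned integer and pack the four resulting bytes into one
 * dword.
 */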
541 void
542 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
543 {
544 dst_reg saturated(this, glsl_type::vec4_type);
545 vec4_instruction *inst = emit(MOV(saturated, src0));
546 inst->saturate = true;
547
548 dst_reg scaled(this, glsl_type::vec4_type);
549 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
550
551 dst_reg rounded(this, glsl_type::vec4_type);
552 emit(RNDE(rounded, src_reg(scaled)));
553
554 dst_reg u(this, glsl_type::uvec4_type);
555 emit(MOV(u, src_reg(rounded)));
556
557 src_reg bytes(u);
558 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
559 }
560
561 void
562 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
563 {
564 dst_reg max(this, glsl_type::vec4_type);
565 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
566
567 dst_reg min(this, glsl_type::vec4_type);
568 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
569
570 dst_reg scaled(this, glsl_type::vec4_type);
571 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
572
573 dst_reg rounded(this, glsl_type::vec4_type);
574 emit(RNDE(rounded, src_reg(scaled)));
575
576 dst_reg i(this, glsl_type::ivec4_type);
577 emit(MOV(i, src_reg(rounded)));
578
579 src_reg bytes(i);
580 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
581 }
582
583 void
584 vec4_visitor::visit_instructions(const exec_list *list)
585 {
586 foreach_in_list(ir_instruction, ir, list) {
587 base_ir = ir;
588 ir->accept(this);
589 }
590 }
591
592 /**
593 * Returns the minimum number of vec4 elements needed to pack a type.
594 *
595 * For simple types, it will return 1 (a single vec4); for matrices, the
596 * number of columns; for arrays and structs, the sum of the type_size_vec4
597 * of each of their elements; and for samplers and atomics, zero.
598 *
599 * This method is useful to calculate how much register space is needed to
600 * store a particular type.
601 */
602 extern "C" int
603 type_size_vec4(const struct glsl_type *type)
604 {
605 unsigned int i;
606 int size;
607
608 switch (type->base_type) {
609 case GLSL_TYPE_UINT:
610 case GLSL_TYPE_INT:
611 case GLSL_TYPE_FLOAT:
612 case GLSL_TYPE_BOOL:
613 if (type->is_matrix()) {
614 return type->matrix_columns;
615 } else {
616 /* Regardless of size of vector, it gets a vec4. This is bad
617 * packing for things like floats, but otherwise arrays become a
618 * mess. Hopefully a later pass over the code can pack scalars
619 * down if appropriate.
620 */
621 return 1;
622 }
623 case GLSL_TYPE_ARRAY:
624 assert(type->length > 0);
625 return type_size_vec4(type->fields.array) * type->length;
626 case GLSL_TYPE_STRUCT:
627 size = 0;
628 for (i = 0; i < type->length; i++) {
629 size += type_size_vec4(type->fields.structure[i].type);
630 }
631 return size;
632 case GLSL_TYPE_SUBROUTINE:
633 return 1;
634
635 case GLSL_TYPE_SAMPLER:
636 /* Samplers take up no register space, since they're baked in at
637 * link time.
638 */
639 return 0;
640 case GLSL_TYPE_ATOMIC_UINT:
641 return 0;
642 case GLSL_TYPE_IMAGE:
643 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
644 case GLSL_TYPE_VOID:
645 case GLSL_TYPE_DOUBLE:
646 case GLSL_TYPE_ERROR:
647 case GLSL_TYPE_INTERFACE:
648 unreachable("not reached");
649 }
650
651 return 0;
652 }
653
654 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
655 {
656 init();
657
658 this->file = GRF;
659 this->reg = v->alloc.allocate(type_size_vec4(type));
660
661 if (type->is_array() || type->is_record()) {
662 this->swizzle = BRW_SWIZZLE_NOOP;
663 } else {
664 this->swizzle = brw_swizzle_for_size(type->vector_elements);
665 }
666
667 this->type = brw_type_for_base_type(type);
668 }
669
670 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
671 {
672 assert(size > 0);
673
674 init();
675
676 this->file = GRF;
677 this->reg = v->alloc.allocate(type_size_vec4(type) * size);
678
679 this->swizzle = BRW_SWIZZLE_NOOP;
680
681 this->type = brw_type_for_base_type(type);
682 }
683
684 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
685 {
686 init();
687
688 this->file = GRF;
689 this->reg = v->alloc.allocate(type_size_vec4(type));
690
691 if (type->is_array() || type->is_record()) {
692 this->writemask = WRITEMASK_XYZW;
693 } else {
694 this->writemask = (1 << type->vector_elements) - 1;
695 }
696
697 this->type = brw_type_for_base_type(type);
698 }
699
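/**
 * Store pointers to the first n components of a uniform value in the
 * stage_prog_data->param[] table starting at param_offset, padding the
 * rest of the vec4 slot with a pointer to zero.
 */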
700 void
701 vec4_visitor::setup_vec4_uniform_value(unsigned param_offset,
702 const gl_constant_value *values,
703 unsigned n)
704 {
705 static const gl_constant_value zero = { 0 };
706
707 assert(param_offset % 4 == 0);
708
709 for (unsigned i = 0; i < n; ++i)
710 stage_prog_data->param[param_offset + i] = &values[i];
711
712 for (unsigned i = n; i < 4; ++i)
713 stage_prog_data->param[param_offset + i] = &zero;
714
715 uniform_vector_size[param_offset / 4] = n;
716 }
717
718 /* Our support for uniforms is piggy-backed on the struct
719 * gl_fragment_program, because that's where the values actually
720 * get stored, rather than in some global gl_shader_program uniform
721 * store.
722 */
723 void
724 vec4_visitor::setup_uniform_values(ir_variable *ir)
725 {
726 int namelen = strlen(ir->name);
727
728 /* The data for our (non-builtin) uniforms is stored in a series of
729 * gl_uniform_driver_storage structs for each subcomponent that
730 * glGetUniformLocation() could name. We know it's been set up in the same
731 * order we'd walk the type, so walk the list of storage and find anything
732 * with our name, or the prefix of a component that starts with our name.
733 */
734 for (unsigned u = 0; u < shader_prog->NumUniformStorage; u++) {
735 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
736
737 if (storage->builtin)
738 continue;
739
740 if (strncmp(ir->name, storage->name, namelen) != 0 ||
741 (storage->name[namelen] != 0 &&
742 storage->name[namelen] != '.' &&
743 storage->name[namelen] != '[')) {
744 continue;
745 }
746
747 const unsigned vector_count = (MAX2(storage->array_elements, 1) *
748 storage->type->matrix_columns);
749 const unsigned vector_size = storage->type->vector_elements;
750
751 for (unsigned s = 0; s < vector_count; s++) {
752 setup_vec4_uniform_value(uniforms * 4,
753 &storage->storage[s * vector_size],
754 vector_size);
755 uniforms++;
756 }
757 }
758 }
759
760 /* Our support for builtin uniforms is even scarier than non-builtin.
761 * It sits on top of the PROG_STATE_VAR parameters that are
762 * automatically updated from GL context state.
763 */
764 void
765 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
766 {
767 const ir_state_slot *const slots = ir->get_state_slots();
768 assert(slots != NULL);
769
770 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
771 /* This state reference has already been set up by ir_to_mesa,
772 * but we'll get the same index back here. We can reference
773 * ParameterValues directly, since unlike brw_fs.cpp, we never
774 * add new state references during compile.
775 */
776 int index = _mesa_add_state_reference(this->prog->Parameters,
777 (gl_state_index *)slots[i].tokens);
778 gl_constant_value *values =
779 &this->prog->Parameters->ParameterValues[index][0];
780
781 assert(this->uniforms < uniform_array_size);
782
783 for (unsigned j = 0; j < 4; j++)
784 stage_prog_data->param[this->uniforms * 4 + j] =
785 &values[GET_SWZ(slots[i].swizzle, j)];
786
787 this->uniform_vector_size[this->uniforms] =
788 (ir->type->is_scalar() || ir->type->is_vector() ||
789 ir->type->is_matrix() ? ir->type->vector_elements : 4);
790
791 this->uniforms++;
792 }
793 }
794
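/** Return the register allocated for var, or NULL if it has none yet. */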
795 dst_reg *
796 vec4_visitor::variable_storage(ir_variable *var)
797 {
798 return (dst_reg *)hash_table_find(this->variable_ht, var);
799 }
800
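/**
 * Evaluate a boolean rvalue into the flag register and return, through
 * predicate, the predication mode a consumer of the flag should use.
 */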
801 void
802 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
803 enum brw_predicate *predicate)
804 {
805 ir_expression *expr = ir->as_expression();
806
807 *predicate = BRW_PREDICATE_NORMAL;
808
809 if (expr && expr->operation != ir_binop_ubo_load) {
810 src_reg op[3];
811 vec4_instruction *inst;
812
813 assert(expr->get_num_operands() <= 3);
814 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
815 expr->operands[i]->accept(this);
816 op[i] = this->result;
817
818 resolve_ud_negate(&op[i]);
819 }
820
821 switch (expr->operation) {
822 case ir_unop_logic_not:
823 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
824 inst->conditional_mod = BRW_CONDITIONAL_Z;
825 break;
826
827 case ir_binop_logic_xor:
828 if (devinfo->gen <= 5) {
829 src_reg temp = src_reg(this, ir->type);
830 emit(XOR(dst_reg(temp), op[0], op[1]));
831 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
832 } else {
833 inst = emit(XOR(dst_null_d(), op[0], op[1]));
834 }
835 inst->conditional_mod = BRW_CONDITIONAL_NZ;
836 break;
837
838 case ir_binop_logic_or:
839 if (devinfo->gen <= 5) {
840 src_reg temp = src_reg(this, ir->type);
841 emit(OR(dst_reg(temp), op[0], op[1]));
842 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
843 } else {
844 inst = emit(OR(dst_null_d(), op[0], op[1]));
845 }
846 inst->conditional_mod = BRW_CONDITIONAL_NZ;
847 break;
848
849 case ir_binop_logic_and:
850 if (devinfo->gen <= 5) {
851 src_reg temp = src_reg(this, ir->type);
852 emit(AND(dst_reg(temp), op[0], op[1]));
853 inst = emit(AND(dst_null_d(), temp, src_reg(1)));
854 } else {
855 inst = emit(AND(dst_null_d(), op[0], op[1]));
856 }
857 inst->conditional_mod = BRW_CONDITIONAL_NZ;
858 break;
859
860 case ir_unop_f2b:
861 if (devinfo->gen >= 6) {
862 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
863 } else {
864 inst = emit(MOV(dst_null_f(), op[0]));
865 inst->conditional_mod = BRW_CONDITIONAL_NZ;
866 }
867 break;
868
869 case ir_unop_i2b:
870 if (devinfo->gen >= 6) {
871 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
872 } else {
873 inst = emit(MOV(dst_null_d(), op[0]));
874 inst->conditional_mod = BRW_CONDITIONAL_NZ;
875 }
876 break;
877
878 case ir_binop_all_equal:
879 if (devinfo->gen <= 5) {
880 resolve_bool_comparison(expr->operands[0], &op[0]);
881 resolve_bool_comparison(expr->operands[1], &op[1]);
882 }
883 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
884 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
885 break;
886
887 case ir_binop_any_nequal:
888 if (devinfo->gen <= 5) {
889 resolve_bool_comparison(expr->operands[0], &op[0]);
890 resolve_bool_comparison(expr->operands[1], &op[1]);
891 }
892 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
893 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
894 break;
895
896 case ir_unop_any:
897 if (devinfo->gen <= 5) {
898 resolve_bool_comparison(expr->operands[0], &op[0]);
899 }
900 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
901 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
902 break;
903
904 case ir_binop_greater:
905 case ir_binop_gequal:
906 case ir_binop_less:
907 case ir_binop_lequal:
908 case ir_binop_equal:
909 case ir_binop_nequal:
910 if (devinfo->gen <= 5) {
911 resolve_bool_comparison(expr->operands[0], &op[0]);
912 resolve_bool_comparison(expr->operands[1], &op[1]);
913 }
914 emit(CMP(dst_null_d(), op[0], op[1],
915 brw_conditional_for_comparison(expr->operation)));
916 break;
917
918 case ir_triop_csel: {
919 /* Expand the boolean condition into the flag register. */
920 inst = emit(MOV(dst_null_d(), op[0]));
921 inst->conditional_mod = BRW_CONDITIONAL_NZ;
922
923 /* Select which boolean to return. */
924 dst_reg temp(this, expr->operands[1]->type);
925 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
926 inst->predicate = BRW_PREDICATE_NORMAL;
927
928 /* Expand the result to a condition code. */
929 inst = emit(MOV(dst_null_d(), src_reg(temp)));
930 inst->conditional_mod = BRW_CONDITIONAL_NZ;
931 break;
932 }
933
934 default:
935 unreachable("not reached");
936 }
937 return;
938 }
939
940 ir->accept(this);
941
942 resolve_ud_negate(&this->result);
943
944 vec4_instruction *inst = emit(AND(dst_null_d(), this->result, src_reg(1)));
945 inst->conditional_mod = BRW_CONDITIONAL_NZ;
946 }
947
948 /**
949 * Emit a gen6 IF statement with the comparison folded into the IF
950 * instruction.
951 */
952 void
953 vec4_visitor::emit_if_gen6(ir_if *ir)
954 {
955 ir_expression *expr = ir->condition->as_expression();
956
957 if (expr && expr->operation != ir_binop_ubo_load) {
958 src_reg op[3];
959 dst_reg temp;
960
961 assert(expr->get_num_operands() <= 3);
962 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
963 expr->operands[i]->accept(this);
964 op[i] = this->result;
965 }
966
967 switch (expr->operation) {
968 case ir_unop_logic_not:
969 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
970 return;
971
972 case ir_binop_logic_xor:
973 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
974 return;
975
976 case ir_binop_logic_or:
977 temp = dst_reg(this, glsl_type::bool_type);
978 emit(OR(temp, op[0], op[1]));
979 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
980 return;
981
982 case ir_binop_logic_and:
983 temp = dst_reg(this, glsl_type::bool_type);
984 emit(AND(temp, op[0], op[1]));
985 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
986 return;
987
988 case ir_unop_f2b:
989 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
990 return;
991
992 case ir_unop_i2b:
993 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
994 return;
995
996 case ir_binop_greater:
997 case ir_binop_gequal:
998 case ir_binop_less:
999 case ir_binop_lequal:
1000 case ir_binop_equal:
1001 case ir_binop_nequal:
1002 emit(IF(op[0], op[1],
1003 brw_conditional_for_comparison(expr->operation)));
1004 return;
1005
1006 case ir_binop_all_equal:
1007 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1008 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
1009 return;
1010
1011 case ir_binop_any_nequal:
1012 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1013 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1014 return;
1015
1016 case ir_unop_any:
1017 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1018 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
1019 return;
1020
1021 case ir_triop_csel: {
1022 /* Expand the boolean condition into the flag register. */
1023 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
1024 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1025
1026 /* Select which boolean to return. */
1027 dst_reg temp(this, expr->operands[1]->type);
1028 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
1029 inst->predicate = BRW_PREDICATE_NORMAL;
1030
1031 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
1032 return;
1033 }
1034
1035 default:
1036 unreachable("not reached");
1037 }
1038 return;
1039 }
1040
1041 ir->condition->accept(this);
1042
1043 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1044 }
1045
1046 void
1047 vec4_visitor::visit(ir_variable *ir)
1048 {
1049 dst_reg *reg = NULL;
1050
1051 if (variable_storage(ir))
1052 return;
1053
1054 switch (ir->data.mode) {
1055 case ir_var_shader_in:
1056 assert(ir->data.location != -1);
1057 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1058 break;
1059
1060 case ir_var_shader_out:
1061 assert(ir->data.location != -1);
1062 reg = new(mem_ctx) dst_reg(this, ir->type);
1063
1064 for (int i = 0; i < type_size_vec4(ir->type); i++) {
1065 output_reg[ir->data.location + i] = *reg;
1066 output_reg[ir->data.location + i].reg_offset = i;
1067 output_reg_annotation[ir->data.location + i] = ir->name;
1068 }
1069 break;
1070
1071 case ir_var_auto:
1072 case ir_var_temporary:
1073 reg = new(mem_ctx) dst_reg(this, ir->type);
1074 break;
1075
1076 case ir_var_uniform:
1077 case ir_var_shader_storage:
1078 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1079
1080 /* Thanks to the lower_ubo_reference pass, we will see only
1081 * ir_binop_{ubo,ssbo}_load expressions and not ir_dereference_variable
1082 * for UBO/SSBO variables, so no need for them to be in variable_ht.
1083 *
1084 * Some uniforms, such as samplers and atomic counters, have no actual
1085 * storage, so we should ignore them.
1086 */
1087 if (ir->is_in_buffer_block() || type_size_vec4(ir->type) == 0)
1088 return;
1089
1090 /* Track how big the whole uniform variable is, in case we need to put a
1091 * copy of its data into pull constants for array access.
1092 */
1093 assert(this->uniforms < uniform_array_size);
1094 this->uniform_size[this->uniforms] = type_size_vec4(ir->type);
1095
1096 if (!strncmp(ir->name, "gl_", 3)) {
1097 setup_builtin_uniform_values(ir);
1098 } else {
1099 setup_uniform_values(ir);
1100 }
1101 break;
1102
1103 case ir_var_system_value:
1104 reg = make_reg_for_system_value(ir->data.location, ir->type);
1105 break;
1106
1107 default:
1108 unreachable("not reached");
1109 }
1110
1111 reg->type = brw_type_for_base_type(ir->type);
1112 hash_table_insert(this->variable_ht, reg, ir);
1113 }
1114
1115 void
1116 vec4_visitor::visit(ir_loop *ir)
1117 {
1118 /* We don't want debugging output to print the whole body of the
1119 * loop as the annotation.
1120 */
1121 this->base_ir = NULL;
1122
1123 emit(BRW_OPCODE_DO);
1124
1125 visit_instructions(&ir->body_instructions);
1126
1127 emit(BRW_OPCODE_WHILE);
1128 }
1129
1130 void
1131 vec4_visitor::visit(ir_loop_jump *ir)
1132 {
1133 switch (ir->mode) {
1134 case ir_loop_jump::jump_break:
1135 emit(BRW_OPCODE_BREAK);
1136 break;
1137 case ir_loop_jump::jump_continue:
1138 emit(BRW_OPCODE_CONTINUE);
1139 break;
1140 }
1141 }
1142
1143
1144 void
1145 vec4_visitor::visit(ir_function_signature *)
1146 {
1147 unreachable("not reached");
1148 }
1149
1150 void
1151 vec4_visitor::visit(ir_function *ir)
1152 {
1153 /* Ignore function bodies other than main() -- we shouldn't see calls to
1154 * them since they should all be inlined.
1155 */
1156 if (strcmp(ir->name, "main") == 0) {
1157 const ir_function_signature *sig;
1158 exec_list empty;
1159
1160 sig = ir->matching_signature(NULL, &empty, false);
1161
1162 assert(sig);
1163
1164 visit_instructions(&sig->body);
1165 }
1166 }
1167
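/**
 * Try to recognize an add-of-multiply (optionally with negate/abs on the
 * multiply operand) and emit a single MAD instead; returns false if the
 * pattern does not apply on this generation or type.
 */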
1168 bool
1169 vec4_visitor::try_emit_mad(ir_expression *ir)
1170 {
1171 /* 3-src instructions were introduced in gen6. */
1172 if (devinfo->gen < 6)
1173 return false;
1174
1175 /* MAD can only handle floating-point data. */
1176 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1177 return false;
1178
1179 ir_rvalue *nonmul;
1180 ir_expression *mul;
1181 bool mul_negate, mul_abs;
1182
1183 for (int i = 0; i < 2; i++) {
1184 mul_negate = false;
1185 mul_abs = false;
1186
1187 mul = ir->operands[i]->as_expression();
1188 nonmul = ir->operands[1 - i];
1189
1190 if (mul && mul->operation == ir_unop_abs) {
1191 mul = mul->operands[0]->as_expression();
1192 mul_abs = true;
1193 } else if (mul && mul->operation == ir_unop_neg) {
1194 mul = mul->operands[0]->as_expression();
1195 mul_negate = true;
1196 }
1197
1198 if (mul && mul->operation == ir_binop_mul)
1199 break;
1200 }
1201
1202 if (!mul || mul->operation != ir_binop_mul)
1203 return false;
1204
1205 nonmul->accept(this);
1206 src_reg src0 = fix_3src_operand(this->result);
1207
1208 mul->operands[0]->accept(this);
1209 src_reg src1 = fix_3src_operand(this->result);
1210 src1.negate ^= mul_negate;
1211 src1.abs = mul_abs;
1212 if (mul_abs)
1213 src1.negate = false;
1214
1215 mul->operands[1]->accept(this);
1216 src_reg src2 = fix_3src_operand(this->result);
1217 src2.abs = mul_abs;
1218 if (mul_abs)
1219 src2.negate = false;
1220
1221 this->result = src_reg(this, ir->type);
1222 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1223
1224 return true;
1225 }
1226
1227 bool
1228 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1229 {
1230 /* This optimization relies on CMP setting the destination to 0 when
1231 * false. Early hardware only sets the least significant bit, and
1232 * leaves the other bits undefined. So we can't use it.
1233 */
1234 if (devinfo->gen < 6)
1235 return false;
1236
1237 ir_expression *const cmp = ir->operands[0]->as_expression();
1238
1239 if (cmp == NULL)
1240 return false;
1241
1242 switch (cmp->operation) {
1243 case ir_binop_less:
1244 case ir_binop_greater:
1245 case ir_binop_lequal:
1246 case ir_binop_gequal:
1247 case ir_binop_equal:
1248 case ir_binop_nequal:
1249 break;
1250
1251 default:
1252 return false;
1253 }
1254
1255 cmp->operands[0]->accept(this);
1256 const src_reg cmp_src0 = this->result;
1257
1258 cmp->operands[1]->accept(this);
1259 const src_reg cmp_src1 = this->result;
1260
1261 this->result = src_reg(this, ir->type);
1262
1263 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1264 brw_conditional_for_comparison(cmp->operation)));
1265
1266 /* If the comparison is false, this->result will just happen to be zero.
1267 */
1268 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1269 this->result, src_reg(1.0f));
1270 inst->predicate = BRW_PREDICATE_NORMAL;
1271 inst->predicate_inverse = true;
1272
1273 return true;
1274 }
1275
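/**
 * Emit a MIN or MAX, selected by conditionalmod (BRW_CONDITIONAL_L or
 * BRW_CONDITIONAL_GE).  Gen6+ can use SEL with a conditional modifier
 * directly; earlier generations need an explicit CMP followed by a
 * predicated SEL.
 */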
1276 vec4_instruction *
1277 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1278 src_reg src0, src_reg src1)
1279 {
1280 vec4_instruction *inst;
1281
1282 if (devinfo->gen >= 6) {
1283 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1284 inst->conditional_mod = conditionalmod;
1285 } else {
1286 emit(CMP(dst, src0, src1, conditionalmod));
1287
1288 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1289 inst->predicate = BRW_PREDICATE_NORMAL;
1290 }
1291
1292 return inst;
1293 }
1294
1295 vec4_instruction *
1296 vec4_visitor::emit_lrp(const dst_reg &dst,
1297 const src_reg &x, const src_reg &y, const src_reg &a)
1298 {
1299 if (devinfo->gen >= 6) {
1300 /* Note that the instruction's argument order is reversed from GLSL
1301 * and the IR.
1302 */
1303 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
1304 fix_3src_operand(x)));
1305 } else {
1306 /* Earlier generations don't support three source operations, so we
1307 * need to emit x*(1-a) + y*a.
1308 */
1309 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1310 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1311 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1312 y_times_a.writemask = dst.writemask;
1313 one_minus_a.writemask = dst.writemask;
1314 x_times_one_minus_a.writemask = dst.writemask;
1315
1316 emit(MUL(y_times_a, y, a));
1317 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1318 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1319 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1320 }
1321 }
1322
1323 /**
1324 * Emits the instructions needed to perform a pull constant load. before_block
1325 * and before_inst can be NULL, in which case the instructions will be appended
1326 * to the end of the instruction list.
1327 */
1328 void
1329 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
1330 src_reg surf_index,
1331 src_reg offset_reg,
1332 bblock_t *before_block,
1333 vec4_instruction *before_inst)
1334 {
1335 assert((before_inst == NULL && before_block == NULL) ||
1336 (before_inst && before_block));
1337
1338 vec4_instruction *pull;
1339
1340 if (devinfo->gen >= 9) {
1341 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
1342 src_reg header(this, glsl_type::uvec4_type, 2);
1343
1344 pull = new(mem_ctx)
1345 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
1346 dst_reg(header));
1347
1348 if (before_inst)
1349 emit_before(before_block, before_inst, pull);
1350 else
1351 emit(pull);
1352
1353 dst_reg index_reg = retype(offset(dst_reg(header), 1),
1354 offset_reg.type);
1355 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
1356
1357 if (before_inst)
1358 emit_before(before_block, before_inst, pull);
1359 else
1360 emit(pull);
1361
1362 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1363 dst,
1364 surf_index,
1365 header);
1366 pull->mlen = 2;
1367 pull->header_size = 1;
1368 } else if (devinfo->gen >= 7) {
1369 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1370
1371 grf_offset.type = offset_reg.type;
1372
1373 pull = MOV(grf_offset, offset_reg);
1374
1375 if (before_inst)
1376 emit_before(before_block, before_inst, pull);
1377 else
1378 emit(pull);
1379
1380 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1381 dst,
1382 surf_index,
1383 src_reg(grf_offset));
1384 pull->mlen = 1;
1385 } else {
1386 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
1387 dst,
1388 surf_index,
1389 offset_reg);
1390 pull->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
1391 pull->mlen = 1;
1392 }
1393
1394 if (before_inst)
1395 emit_before(before_block, before_inst, pull);
1396 else
1397 emit(pull);
1398 }
1399
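/**
 * Copy the value of src from one live channel into all channels of the
 * returned register, so it can be used where a scalar (uniform) value is
 * required -- e.g. a dynamically computed surface index.
 */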
1400 src_reg
1401 vec4_visitor::emit_uniformize(const src_reg &src)
1402 {
1403 const src_reg chan_index(this, glsl_type::uint_type);
1404 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
1405 src.type);
1406
1407 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
1408 ->force_writemask_all = true;
1409 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
1410 ->force_writemask_all = true;
1411
1412 return src_reg(dst);
1413 }
1414
1415 void
1416 vec4_visitor::visit(ir_expression *ir)
1417 {
1418 unsigned int operand;
1419 src_reg op[ARRAY_SIZE(ir->operands)];
1420 vec4_instruction *inst;
1421
1422 if (ir->operation == ir_binop_add) {
1423 if (try_emit_mad(ir))
1424 return;
1425 }
1426
1427 if (ir->operation == ir_unop_b2f) {
1428 if (try_emit_b2f_of_compare(ir))
1429 return;
1430 }
1431
1432 /* Storage for our result. Ideally for an assignment we'd be using
1433 * the actual storage for the result here, instead.
1434 */
1435 dst_reg result_dst(this, ir->type);
1436 src_reg result_src(result_dst);
1437
1438 if (ir->operation == ir_triop_csel) {
1439 ir->operands[1]->accept(this);
1440 op[1] = this->result;
1441 ir->operands[2]->accept(this);
1442 op[2] = this->result;
1443
1444 enum brw_predicate predicate;
1445 emit_bool_to_cond_code(ir->operands[0], &predicate);
1446 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1447 inst->predicate = predicate;
1448 this->result = result_src;
1449 return;
1450 }
1451
1452 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1453 this->result.file = BAD_FILE;
1454 ir->operands[operand]->accept(this);
1455 if (this->result.file == BAD_FILE) {
1456 fprintf(stderr, "Failed to get tree for expression operand:\n");
1457 ir->operands[operand]->fprint(stderr);
1458 exit(1);
1459 }
1460 op[operand] = this->result;
1461
1462 /* Matrix expression operands should have been broken down to vector
1463 * operations already.
1464 */
1465 assert(!ir->operands[operand]->type->is_matrix());
1466 }
1467
1468 /* If nothing special happens, this is the result. */
1469 this->result = result_src;
1470
1471 switch (ir->operation) {
1472 case ir_unop_logic_not:
1473 emit(NOT(result_dst, op[0]));
1474 break;
1475 case ir_unop_neg:
1476 op[0].negate = !op[0].negate;
1477 emit(MOV(result_dst, op[0]));
1478 break;
1479 case ir_unop_abs:
1480 op[0].abs = true;
1481 op[0].negate = false;
1482 emit(MOV(result_dst, op[0]));
1483 break;
1484
1485 case ir_unop_sign:
1486 if (ir->type->is_float()) {
1487 /* AND(val, 0x80000000) gives the sign bit.
1488 *
1489 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1490 * zero.
1491 */
1492 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1493
1494 op[0].type = BRW_REGISTER_TYPE_UD;
1495 result_dst.type = BRW_REGISTER_TYPE_UD;
1496 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1497
1498 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1499 inst->predicate = BRW_PREDICATE_NORMAL;
1500
1501 this->result.type = BRW_REGISTER_TYPE_F;
1502 } else {
1503 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1504 * -> non-negative val generates 0x00000000.
1505 * Predicated OR sets 1 if val is positive.
1506 */
1507 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1508
1509 emit(ASR(result_dst, op[0], src_reg(31)));
1510
1511 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1512 inst->predicate = BRW_PREDICATE_NORMAL;
1513 }
1514 break;
1515
1516 case ir_unop_rcp:
1517 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1518 break;
1519
1520 case ir_unop_exp2:
1521 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1522 break;
1523 case ir_unop_log2:
1524 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1525 break;
1526 case ir_unop_exp:
1527 case ir_unop_log:
1528 unreachable("not reached: should be handled by ir_explog_to_explog2");
1529 case ir_unop_sin:
1530 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1531 break;
1532 case ir_unop_cos:
1533 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1534 break;
1535
1536 case ir_unop_dFdx:
1537 case ir_unop_dFdx_coarse:
1538 case ir_unop_dFdx_fine:
1539 case ir_unop_dFdy:
1540 case ir_unop_dFdy_coarse:
1541 case ir_unop_dFdy_fine:
1542 unreachable("derivatives not valid in vertex shader");
1543
1544 case ir_unop_bitfield_reverse:
1545 emit(BFREV(result_dst, op[0]));
1546 break;
1547 case ir_unop_bit_count:
1548 emit(CBIT(result_dst, op[0]));
1549 break;
1550 case ir_unop_find_msb: {
1551 src_reg temp = src_reg(this, glsl_type::uint_type);
1552
1553 inst = emit(FBH(dst_reg(temp), op[0]));
1554 inst->dst.writemask = WRITEMASK_XYZW;
1555
1556 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1557 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1558 * subtract the result from 31 to convert the MSB count into an LSB count.
1559 */
1560
1561 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1562 temp.swizzle = BRW_SWIZZLE_NOOP;
1563 emit(MOV(result_dst, temp));
1564
1565 src_reg src_tmp = src_reg(result_dst);
1566 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1567
1568 src_tmp.negate = true;
1569 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1570 inst->predicate = BRW_PREDICATE_NORMAL;
1571 break;
1572 }
1573 case ir_unop_find_lsb:
1574 emit(FBL(result_dst, op[0]));
1575 break;
1576 case ir_unop_saturate:
1577 inst = emit(MOV(result_dst, op[0]));
1578 inst->saturate = true;
1579 break;
1580
1581 case ir_unop_noise:
1582 unreachable("not reached: should be handled by lower_noise");
1583
1584 case ir_unop_subroutine_to_int:
1585 emit(MOV(result_dst, op[0]));
1586 break;
1587
1588 case ir_unop_ssbo_unsized_array_length:
1589 unreachable("not reached: should be handled by lower_ubo_reference");
1590 break;
1591
1592 case ir_binop_add:
1593 emit(ADD(result_dst, op[0], op[1]));
1594 break;
1595 case ir_binop_sub:
1596 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1597
1598 case ir_binop_mul:
1599 if (devinfo->gen < 8 && ir->type->is_integer()) {
1600 /* For integer multiplication, the MUL uses the low 16 bits of one of
1601 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1602 * accumulates the contribution of the upper 16 bits of that
1603 * operand. If we can determine that one of the args is in the low
1604 * 16 bits, though, we can just emit a single MUL.
1605 */
1606 if (ir->operands[0]->is_uint16_constant()) {
1607 if (devinfo->gen < 7)
1608 emit(MUL(result_dst, op[0], op[1]));
1609 else
1610 emit(MUL(result_dst, op[1], op[0]));
1611 } else if (ir->operands[1]->is_uint16_constant()) {
1612 if (devinfo->gen < 7)
1613 emit(MUL(result_dst, op[1], op[0]));
1614 else
1615 emit(MUL(result_dst, op[0], op[1]));
1616 } else {
1617 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1618
1619 emit(MUL(acc, op[0], op[1]));
1620 emit(MACH(dst_null_d(), op[0], op[1]));
1621 emit(MOV(result_dst, src_reg(acc)));
1622 }
1623 } else {
1624 emit(MUL(result_dst, op[0], op[1]));
1625 }
1626 break;
1627 case ir_binop_imul_high: {
1628 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1629
1630 emit(MUL(acc, op[0], op[1]));
1631 emit(MACH(result_dst, op[0], op[1]));
1632 break;
1633 }
1634 case ir_binop_div:
1635 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1636 assert(ir->type->is_integer());
1637 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1638 break;
1639
1640 case ir_binop_carry:
1641 unreachable("Should have been lowered by carry_to_arith().");
1642
1643 case ir_binop_borrow:
1644 unreachable("Should have been lowered by borrow_to_arith().");
1645
1646 case ir_binop_mod:
1647 /* Floating point should be lowered by MOD_TO_FLOOR in the compiler. */
1648 assert(ir->type->is_integer());
1649 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1650 break;
1651
1652 case ir_binop_less:
1653 case ir_binop_greater:
1654 case ir_binop_lequal:
1655 case ir_binop_gequal:
1656 case ir_binop_equal:
1657 case ir_binop_nequal: {
1658 if (devinfo->gen <= 5) {
1659 resolve_bool_comparison(ir->operands[0], &op[0]);
1660 resolve_bool_comparison(ir->operands[1], &op[1]);
1661 }
1662 emit(CMP(result_dst, op[0], op[1],
1663 brw_conditional_for_comparison(ir->operation)));
1664 break;
1665 }
1666
1667 case ir_binop_all_equal:
1668 if (devinfo->gen <= 5) {
1669 resolve_bool_comparison(ir->operands[0], &op[0]);
1670 resolve_bool_comparison(ir->operands[1], &op[1]);
1671 }
1672
1673 /* "==" operator producing a scalar boolean. */
1674 if (ir->operands[0]->type->is_vector() ||
1675 ir->operands[1]->type->is_vector()) {
1676 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1677 emit(MOV(result_dst, src_reg(0)));
1678 inst = emit(MOV(result_dst, src_reg(~0)));
1679 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1680 } else {
1681 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1682 }
1683 break;
1684 case ir_binop_any_nequal:
1685 if (devinfo->gen <= 5) {
1686 resolve_bool_comparison(ir->operands[0], &op[0]);
1687 resolve_bool_comparison(ir->operands[1], &op[1]);
1688 }
1689
1690 /* "!=" operator producing a scalar boolean. */
1691 if (ir->operands[0]->type->is_vector() ||
1692 ir->operands[1]->type->is_vector()) {
1693 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1694
1695 emit(MOV(result_dst, src_reg(0)));
1696 inst = emit(MOV(result_dst, src_reg(~0)));
1697 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1698 } else {
1699 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1700 }
1701 break;
1702
1703 case ir_unop_any:
1704 if (devinfo->gen <= 5) {
1705 resolve_bool_comparison(ir->operands[0], &op[0]);
1706 }
1707 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1708 emit(MOV(result_dst, src_reg(0)));
1709
1710 inst = emit(MOV(result_dst, src_reg(~0)));
1711 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1712 break;
1713
1714 case ir_binop_logic_xor:
1715 emit(XOR(result_dst, op[0], op[1]));
1716 break;
1717
1718 case ir_binop_logic_or:
1719 emit(OR(result_dst, op[0], op[1]));
1720 break;
1721
1722 case ir_binop_logic_and:
1723 emit(AND(result_dst, op[0], op[1]));
1724 break;
1725
1726 case ir_binop_dot:
1727 assert(ir->operands[0]->type->is_vector());
1728 assert(ir->operands[0]->type == ir->operands[1]->type);
1729 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1730 break;
1731
1732 case ir_unop_sqrt:
1733 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1734 break;
1735 case ir_unop_rsq:
1736 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1737 break;
1738
1739 case ir_unop_bitcast_i2f:
1740 case ir_unop_bitcast_u2f:
1741 this->result = op[0];
1742 this->result.type = BRW_REGISTER_TYPE_F;
1743 break;
1744
1745 case ir_unop_bitcast_f2i:
1746 this->result = op[0];
1747 this->result.type = BRW_REGISTER_TYPE_D;
1748 break;
1749
1750 case ir_unop_bitcast_f2u:
1751 this->result = op[0];
1752 this->result.type = BRW_REGISTER_TYPE_UD;
1753 break;
1754
1755 case ir_unop_i2f:
1756 case ir_unop_i2u:
1757 case ir_unop_u2i:
1758 case ir_unop_u2f:
1759 case ir_unop_f2i:
1760 case ir_unop_f2u:
1761 emit(MOV(result_dst, op[0]));
1762 break;
1763 case ir_unop_b2i:
1764 case ir_unop_b2f:
1765 if (devinfo->gen <= 5) {
1766 resolve_bool_comparison(ir->operands[0], &op[0]);
1767 }
1768 emit(MOV(result_dst, negate(op[0])));
1769 break;
1770 case ir_unop_f2b:
1771 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1772 break;
1773 case ir_unop_i2b:
1774 emit(CMP(result_dst, op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1775 break;
1776
1777 case ir_unop_trunc:
1778 emit(RNDZ(result_dst, op[0]));
1779 break;
1780 case ir_unop_ceil: {
1781 src_reg tmp = src_reg(this, ir->type);
1782 op[0].negate = !op[0].negate;
1783 emit(RNDD(dst_reg(tmp), op[0]));
1784 tmp.negate = true;
1785 emit(MOV(result_dst, tmp));
1786 }
1787 break;
1788 case ir_unop_floor:
1789 inst = emit(RNDD(result_dst, op[0]));
1790 break;
1791 case ir_unop_fract:
1792 inst = emit(FRC(result_dst, op[0]));
1793 break;
1794 case ir_unop_round_even:
1795 emit(RNDE(result_dst, op[0]));
1796 break;
1797
1798 case ir_unop_get_buffer_size:
1799 unreachable("not reached: not implemented");
1800 break;
1801
1802 case ir_binop_min:
1803 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1804 break;
1805 case ir_binop_max:
1806 emit_minmax(BRW_CONDITIONAL_GE, result_dst, op[0], op[1]);
1807 break;
1808
1809 case ir_binop_pow:
1810 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1811 break;
1812
1813 case ir_unop_bit_not:
1814 inst = emit(NOT(result_dst, op[0]));
1815 break;
1816 case ir_binop_bit_and:
1817 inst = emit(AND(result_dst, op[0], op[1]));
1818 break;
1819 case ir_binop_bit_xor:
1820 inst = emit(XOR(result_dst, op[0], op[1]));
1821 break;
1822 case ir_binop_bit_or:
1823 inst = emit(OR(result_dst, op[0], op[1]));
1824 break;
1825
1826 case ir_binop_lshift:
1827 inst = emit(SHL(result_dst, op[0], op[1]));
1828 break;
1829
1830 case ir_binop_rshift:
1831 if (ir->type->base_type == GLSL_TYPE_INT)
1832 inst = emit(ASR(result_dst, op[0], op[1]));
1833 else
1834 inst = emit(SHR(result_dst, op[0], op[1]));
1835 break;
1836
1837 case ir_binop_bfm:
1838 emit(BFI1(result_dst, op[0], op[1]));
1839 break;
1840
1841 case ir_binop_ubo_load: {
1842 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1843 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1844 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1845 src_reg offset;
1846
1847 /* Now, load the vector from that offset. */
1848 assert(ir->type->is_vector() || ir->type->is_scalar());
1849
1850 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1851 packed_consts.type = result.type;
1852 src_reg surf_index;
1853
1854 if (const_uniform_block) {
1855 /* The block index is a constant, so just emit the binding table entry
1856 * as an immediate.
1857 */
1858 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1859 const_uniform_block->value.u[0]);
1860 } else {
1861 /* The block index is not a constant. Evaluate the index expression
1862 * per-channel and add the base UBO index; we have to select a value
1863 * from any live channel.
1864 */
1865 surf_index = src_reg(this, glsl_type::uint_type);
1866 emit(ADD(dst_reg(surf_index), op[0],
1867 src_reg(prog_data->base.binding_table.ubo_start)));
1868 surf_index = emit_uniformize(surf_index);
1869
1870 /* Assume this may touch any UBO. It would be nice to provide
1871 * a tighter bound, but the array information is already lowered away.
1872 */
1873 brw_mark_surface_used(&prog_data->base,
1874 prog_data->base.binding_table.ubo_start +
1875 shader_prog->NumBufferInterfaceBlocks - 1);
1876 }
1877
1878 if (const_offset_ir) {
1879 if (devinfo->gen >= 8) {
1880 /* Store the offset in a GRF so we can send-from-GRF. */
1881 offset = src_reg(this, glsl_type::int_type);
1882 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1883 } else {
1884 /* Immediates are fine on older generations since they'll be moved
1885 * to a (potentially fake) MRF at the generator level.
1886 */
1887 offset = src_reg(const_offset / 16);
1888 }
1889 } else {
1890 offset = src_reg(this, glsl_type::uint_type);
1891 emit(SHR(dst_reg(offset), op[1], src_reg(4u)));
1892 }
1893
1894 emit_pull_constant_load_reg(dst_reg(packed_consts),
1895 surf_index,
1896 offset,
1897 NULL, NULL /* before_block/inst */);
1898
1899 packed_consts.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
1900 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1901 const_offset % 16 / 4,
1902 const_offset % 16 / 4,
1903 const_offset % 16 / 4);
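/* A quick worked example of the swizzle math above (an illustrative note,
 * not part of the original source): for a vec2 load at constant byte offset
 * 24, const_offset / 16 == 1 selects the second vec4 of the UBO for the pull
 * constant fetch, and const_offset % 16 / 4 == 2 bumps every channel of
 * brw_swizzle_for_size(2) by two, so the final swizzle works out to ZWWW and
 * the result reads the .zw components of the fetched vec4.
 */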
1904
1905 /* UBO bools are any nonzero int. We need to convert them to 0/~0. */
1906 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1907 emit(CMP(result_dst, packed_consts, src_reg(0u),
1908 BRW_CONDITIONAL_NZ));
1909 } else {
1910 emit(MOV(result_dst, packed_consts));
1911 }
1912 break;
1913 }
1914
1915 case ir_binop_vector_extract:
1916 unreachable("should have been lowered by vec_index_to_cond_assign");
1917
1918 case ir_triop_fma:
1919 op[0] = fix_3src_operand(op[0]);
1920 op[1] = fix_3src_operand(op[1]);
1921 op[2] = fix_3src_operand(op[2]);
1922 /* Note that the instruction's argument order is reversed from GLSL
1923 * and the IR.
1924 */
1925 emit(MAD(result_dst, op[2], op[1], op[0]));
1926 break;
1927
1928 case ir_triop_lrp:
1929 emit_lrp(result_dst, op[0], op[1], op[2]);
1930 break;
1931
1932 case ir_triop_csel:
1933 unreachable("already handled above");
1934 break;
1935
1936 case ir_triop_bfi:
1937 op[0] = fix_3src_operand(op[0]);
1938 op[1] = fix_3src_operand(op[1]);
1939 op[2] = fix_3src_operand(op[2]);
1940 emit(BFI2(result_dst, op[0], op[1], op[2]));
1941 break;
1942
1943 case ir_triop_bitfield_extract:
1944 op[0] = fix_3src_operand(op[0]);
1945 op[1] = fix_3src_operand(op[1]);
1946 op[2] = fix_3src_operand(op[2]);
1947 /* Note that the instruction's argument order is reversed from GLSL
1948 * and the IR.
1949 */
1950 emit(BFE(result_dst, op[2], op[1], op[0]));
1951 break;
1952
1953 case ir_triop_vector_insert:
1954 unreachable("should have been lowered by lower_vector_insert");
1955
1956 case ir_quadop_bitfield_insert:
1957 unreachable("not reached: should be handled by "
1958 "bitfield_insert_to_bfm_bfi\n");
1959
1960 case ir_quadop_vector:
1961 unreachable("not reached: should be handled by lower_quadop_vector");
1962
1963 case ir_unop_pack_half_2x16:
1964 emit_pack_half_2x16(result_dst, op[0]);
1965 break;
1966 case ir_unop_unpack_half_2x16:
1967 emit_unpack_half_2x16(result_dst, op[0]);
1968 break;
1969 case ir_unop_unpack_unorm_4x8:
1970 emit_unpack_unorm_4x8(result_dst, op[0]);
1971 break;
1972 case ir_unop_unpack_snorm_4x8:
1973 emit_unpack_snorm_4x8(result_dst, op[0]);
1974 break;
1975 case ir_unop_pack_unorm_4x8:
1976 emit_pack_unorm_4x8(result_dst, op[0]);
1977 break;
1978 case ir_unop_pack_snorm_4x8:
1979 emit_pack_snorm_4x8(result_dst, op[0]);
1980 break;
1981 case ir_unop_pack_snorm_2x16:
1982 case ir_unop_pack_unorm_2x16:
1983 case ir_unop_unpack_snorm_2x16:
1984 case ir_unop_unpack_unorm_2x16:
1985 unreachable("not reached: should be handled by lower_packing_builtins");
1986 case ir_unop_unpack_half_2x16_split_x:
1987 case ir_unop_unpack_half_2x16_split_y:
1988 case ir_binop_pack_half_2x16_split:
1989 case ir_unop_interpolate_at_centroid:
1990 case ir_binop_interpolate_at_sample:
1991 case ir_binop_interpolate_at_offset:
1992 unreachable("not reached: should not occur in vertex shader");
1993 case ir_binop_ldexp:
1994 unreachable("not reached: should be handled by ldexp_to_arith()");
1995 case ir_unop_d2f:
1996 case ir_unop_f2d:
1997 case ir_unop_d2i:
1998 case ir_unop_i2d:
1999 case ir_unop_d2u:
2000 case ir_unop_u2d:
2001 case ir_unop_d2b:
2002 case ir_unop_pack_double_2x32:
2003 case ir_unop_unpack_double_2x32:
2004 case ir_unop_frexp_sig:
2005 case ir_unop_frexp_exp:
2006 unreachable("fp64 todo");
2007 }
2008 }
2009
2010
2011 void
2012 vec4_visitor::visit(ir_swizzle *ir)
2013 {
2014 /* Note that this handles only swizzles in expressions, not those on the
2015 * left hand side of an assignment, which use write masking. See
2016 * ir_assignment for that.
2017 */
2018 const unsigned swz = brw_compose_swizzle(
2019 brw_swizzle_for_size(ir->type->vector_elements),
2020 BRW_SWIZZLE4(ir->mask.x, ir->mask.y, ir->mask.z, ir->mask.w));
2021
2022 ir->val->accept(this);
2023 this->result = swizzle(this->result, swz);
2024 }
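/* An illustrative sketch of the composition above (not part of the original
 * source): for expr.zy, brw_swizzle_for_size(2) is XYYY; composing it with
 * the mask ZY.. yields ZYYY, so the channels beyond the type's width simply
 * replicate the last selected component rather than reading junk.
 */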
2025
2026 void
2027 vec4_visitor::visit(ir_dereference_variable *ir)
2028 {
2029 const struct glsl_type *type = ir->type;
2030 dst_reg *reg = variable_storage(ir->var);
2031
2032 if (!reg) {
2033 fail("Failed to find variable storage for %s\n", ir->var->name);
2034 this->result = src_reg(brw_null_reg());
2035 return;
2036 }
2037
2038 this->result = src_reg(*reg);
2039
2040 /* System values get their swizzle from the dst_reg writemask */
2041 if (ir->var->data.mode == ir_var_system_value)
2042 return;
2043
2044 if (type->is_scalar() || type->is_vector() || type->is_matrix())
2045 this->result.swizzle = brw_swizzle_for_size(type->vector_elements);
2046 }
2047
2048
2049 int
2050 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
2051 {
2052 /* Under normal circumstances array elements are stored consecutively, so
2053 * the stride is equal to the size of the array element.
2054 */
2055 return type_size_vec4(ir->type);
2056 }
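/* For illustration (not part of the original source): an array of mat3
 * elements has a stride of 3, since each element occupies one vec4-sized
 * register per column, while arrays of float or vec4 have a stride of 1
 * because every element still takes a full vec4 slot.
 */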
2057
2058
2059 void
2060 vec4_visitor::visit(ir_dereference_array *ir)
2061 {
2062 ir_constant *constant_index;
2063 src_reg src;
2064 int array_stride = compute_array_stride(ir);
2065
2066 constant_index = ir->array_index->constant_expression_value();
2067
2068 ir->array->accept(this);
2069 src = this->result;
2070
2071 if (constant_index) {
2072 src.reg_offset += constant_index->value.i[0] * array_stride;
2073 } else {
2074 /* Variable index array dereference. It takes the "vec4" base of
2075 * the array and an index that offsets the Mesa register
2076 * index.
2077 */
2078 ir->array_index->accept(this);
2079
2080 src_reg index_reg;
2081
2082 if (array_stride == 1) {
2083 index_reg = this->result;
2084 } else {
2085 index_reg = src_reg(this, glsl_type::int_type);
2086
2087 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
2088 }
2089
2090 if (src.reladdr) {
2091 src_reg temp = src_reg(this, glsl_type::int_type);
2092
2093 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
2094
2095 index_reg = temp;
2096 }
2097
2098 src.reladdr = ralloc(mem_ctx, src_reg);
2099 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
2100 }
2101
2102 /* If the type is smaller than a vec4, replicate the last channel out. */
2103 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2104 src.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2105 else
2106 src.swizzle = BRW_SWIZZLE_NOOP;
2107 src.type = brw_type_for_base_type(ir->type);
2108
2109 this->result = src;
2110 }
2111
2112 void
2113 vec4_visitor::visit(ir_dereference_record *ir)
2114 {
2115 unsigned int i;
2116 const glsl_type *struct_type = ir->record->type;
2117 int offset = 0;
2118
2119 ir->record->accept(this);
2120
2121 for (i = 0; i < struct_type->length; i++) {
2122 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
2123 break;
2124 offset += type_size_vec4(struct_type->fields.structure[i].type);
2125 }
2126
2127 /* If the type is smaller than a vec4, replicate the last channel out. */
2128 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2129 this->result.swizzle = brw_swizzle_for_size(ir->type->vector_elements);
2130 else
2131 this->result.swizzle = BRW_SWIZZLE_NOOP;
2132 this->result.type = brw_type_for_base_type(ir->type);
2133
2134 this->result.reg_offset += offset;
2135 }
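/* A worked example of the field-offset scan above (illustrative only): for
 * struct { vec3 a; mat4 b; float c; }, dereferencing .c accumulates
 * type_size_vec4(vec3) + type_size_vec4(mat4) == 1 + 4, so the result's
 * reg_offset advances five vec4 slots past the base of the struct.
 */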
2136
2137 /**
2138 * We want to be careful in assignment setup to hit the actual storage
2139 * instead of potentially using a temporary like we might with the
2140 * ir_dereference handler.
2141 */
2142 static dst_reg
2143 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2144 {
2145 /* The LHS must be a dereference. If the LHS is a variable indexed array
2146 * access of a vector, it must be separated into a series of conditional
2147 * before reaching this point (see ir_vec_index_to_cond_assign).
2148 */
2149 assert(ir->as_dereference());
2150 ir_dereference_array *deref_array = ir->as_dereference_array();
2151 if (deref_array) {
2152 assert(!deref_array->array->type->is_vector());
2153 }
2154
2155 /* Use the rvalue deref handler for the most part. We'll ignore
2156 * swizzles in it and write swizzles using writemask, though.
2157 */
2158 ir->accept(v);
2159 return dst_reg(v->result);
2160 }
2161
2162 void
2163 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2164 const struct glsl_type *type,
2165 enum brw_predicate predicate)
2166 {
2167 if (type->base_type == GLSL_TYPE_STRUCT) {
2168 for (unsigned int i = 0; i < type->length; i++) {
2169 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2170 }
2171 return;
2172 }
2173
2174 if (type->is_array()) {
2175 for (unsigned int i = 0; i < type->length; i++) {
2176 emit_block_move(dst, src, type->fields.array, predicate);
2177 }
2178 return;
2179 }
2180
2181 if (type->is_matrix()) {
2182 const struct glsl_type *vec_type;
2183
2184 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2185 type->vector_elements, 1);
2186
2187 for (int i = 0; i < type->matrix_columns; i++) {
2188 emit_block_move(dst, src, vec_type, predicate);
2189 }
2190 return;
2191 }
2192
2193 assert(type->is_scalar() || type->is_vector());
2194
2195 dst->type = brw_type_for_base_type(type);
2196 src->type = dst->type;
2197
2198 dst->writemask = (1 << type->vector_elements) - 1;
2199
2200 src->swizzle = brw_swizzle_for_size(type->vector_elements);
2201
2202 vec4_instruction *inst = emit(MOV(*dst, *src));
2203 inst->predicate = predicate;
2204
2205 dst->reg_offset++;
2206 src->reg_offset++;
2207 }
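/* Illustrative note (not in the original source): for a mat2 the recursion
 * above bottoms out in two vec2 moves, each MOV using writemask .xy and
 * swizzle XYYY, with reg_offset advancing one vec4 slot per column on both
 * the source and the destination.
 */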
2208
2209
2210 /* If the RHS processing resulted in an instruction generating a
2211 * temporary value, and it would be easy to rewrite the instruction to
2212 * generate its result right into the LHS instead, do so. This ends
2213 * up reliably removing instructions where it can be tricky to do so
2214 * later without real UD chain information.
2215 */
2216 bool
2217 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2218 dst_reg dst,
2219 src_reg src,
2220 vec4_instruction *pre_rhs_inst,
2221 vec4_instruction *last_rhs_inst)
2222 {
2223 /* This could be supported, but it would take more smarts. */
2224 if (ir->condition)
2225 return false;
2226
2227 if (pre_rhs_inst == last_rhs_inst)
2228 return false; /* No instructions generated to work with. */
2229
2230 /* Make sure the last instruction generated our source reg. */
2231 if (src.file != GRF ||
2232 src.file != last_rhs_inst->dst.file ||
2233 src.reg != last_rhs_inst->dst.reg ||
2234 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2235 src.reladdr ||
2236 src.abs ||
2237 src.negate ||
2238 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2239 return false;
2240
2241 /* Check that the last instruction fully initialized the channels
2242 * we want to use, in the order we want to use them. We could
2243 * potentially reswizzle the operands of many instructions so that
2244 * we could handle out of order channels, but don't yet.
2245 */
2246
2247 for (unsigned i = 0; i < 4; i++) {
2248 if (dst.writemask & (1 << i)) {
2249 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2250 return false;
2251
2252 if (BRW_GET_SWZ(src.swizzle, i) != i)
2253 return false;
2254 }
2255 }
2256
2257 /* Success! Rewrite the instruction. */
2258 last_rhs_inst->dst.file = dst.file;
2259 last_rhs_inst->dst.reg = dst.reg;
2260 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2261 last_rhs_inst->dst.reladdr = dst.reladdr;
2262 last_rhs_inst->dst.writemask &= dst.writemask;
2263
2264 return true;
2265 }
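/* A sketch of what the rewrite saves (illustrative, not part of the original
 * source): for "dst.xy = a + b" the RHS visit emits an ADD into a temporary
 * GRF. Rather than following it with "MOV dst.xy, tmp", the ADD's
 * destination is retargeted at dst and its writemask narrowed to dst's, so
 * the copy is never emitted at all.
 */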
2266
2267 void
2268 vec4_visitor::visit(ir_assignment *ir)
2269 {
2270 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2271 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2272
2273 if (!ir->lhs->type->is_scalar() &&
2274 !ir->lhs->type->is_vector()) {
2275 ir->rhs->accept(this);
2276 src_reg src = this->result;
2277
2278 if (ir->condition) {
2279 emit_bool_to_cond_code(ir->condition, &predicate);
2280 }
2281
2282 /* emit_block_move doesn't account for swizzles in the source register.
2283 * This should be ok, since the source register is a structure or an
2284 * array, and those can't be swizzled. But double-check to be sure.
2285 */
2286 assert(src.swizzle ==
2287 (ir->rhs->type->is_matrix()
2288 ? brw_swizzle_for_size(ir->rhs->type->vector_elements)
2289 : BRW_SWIZZLE_NOOP));
2290
2291 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2292 return;
2293 }
2294
2295 /* Now we're down to just a scalar/vector with writemasks. */
2296 int i;
2297
2298 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2299 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2300
2301 ir->rhs->accept(this);
2302
2303 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2304
2305 int swizzles[4];
2306 int src_chan = 0;
2307
2308 assert(ir->lhs->type->is_vector() ||
2309 ir->lhs->type->is_scalar());
2310 dst.writemask = ir->write_mask;
2311
2312 /* Swizzle a small RHS vector into the channels being written.
2313 *
2314 * glsl ir treats write_mask as dictating how many channels are
2315 * present on the RHS while in our instructions we need to make
2316 * those channels appear in the slots of the vec4 they're written to.
2317 */
2318 for (int i = 0; i < 4; i++)
2319 swizzles[i] = (ir->write_mask & (1 << i) ? src_chan++ : 0);
2320
2321 src_reg src = swizzle(this->result,
2322 BRW_SWIZZLE4(swizzles[0], swizzles[1],
2323 swizzles[2], swizzles[3]));
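/* A quick example of the remapping above (illustrative only): for
 * "v.yw = someVec2" the write mask is .yw, so swizzles[] becomes
 * {0, 0, 0, 1}; destination channel y then reads the RHS's .x and channel w
 * reads its .y, while the unwritten channels are ignored.
 */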
2324
2325 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2326 return;
2327 }
2328
2329 if (ir->condition) {
2330 emit_bool_to_cond_code(ir->condition, &predicate);
2331 }
2332
2333 for (i = 0; i < type_size_vec4(ir->lhs->type); i++) {
2334 vec4_instruction *inst = emit(MOV(dst, src));
2335 inst->predicate = predicate;
2336
2337 dst.reg_offset++;
2338 src.reg_offset++;
2339 }
2340 }
2341
2342 void
2343 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2344 {
2345 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2346 foreach_in_list(ir_constant, field_value, &ir->components) {
2347 emit_constant_values(dst, field_value);
2348 }
2349 return;
2350 }
2351
2352 if (ir->type->is_array()) {
2353 for (unsigned int i = 0; i < ir->type->length; i++) {
2354 emit_constant_values(dst, ir->array_elements[i]);
2355 }
2356 return;
2357 }
2358
2359 if (ir->type->is_matrix()) {
2360 for (int i = 0; i < ir->type->matrix_columns; i++) {
2361 float *vec = &ir->value.f[i * ir->type->vector_elements];
2362
2363 for (int j = 0; j < ir->type->vector_elements; j++) {
2364 dst->writemask = 1 << j;
2365 dst->type = BRW_REGISTER_TYPE_F;
2366
2367 emit(MOV(*dst, src_reg(vec[j])));
2368 }
2369 dst->reg_offset++;
2370 }
2371 return;
2372 }
2373
2374 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2375
2376 for (int i = 0; i < ir->type->vector_elements; i++) {
2377 if (!(remaining_writemask & (1 << i)))
2378 continue;
2379
2380 dst->writemask = 1 << i;
2381 dst->type = brw_type_for_base_type(ir->type);
2382
2383 /* Find other components that match the one we're about to
2384 * write. Emits fewer instructions for things like vec4(0.5,
2385 * 1.5, 1.5, 1.5).
2386 */
2387 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2388 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2389 if (ir->value.b[i] == ir->value.b[j])
2390 dst->writemask |= (1 << j);
2391 } else {
2392 /* u, i, and f storage all line up, so no need for a
2393 * switch case for comparing each type.
2394 */
2395 if (ir->value.u[i] == ir->value.u[j])
2396 dst->writemask |= (1 << j);
2397 }
2398 }
2399
2400 switch (ir->type->base_type) {
2401 case GLSL_TYPE_FLOAT:
2402 emit(MOV(*dst, src_reg(ir->value.f[i])));
2403 break;
2404 case GLSL_TYPE_INT:
2405 emit(MOV(*dst, src_reg(ir->value.i[i])));
2406 break;
2407 case GLSL_TYPE_UINT:
2408 emit(MOV(*dst, src_reg(ir->value.u[i])));
2409 break;
2410 case GLSL_TYPE_BOOL:
2411 emit(MOV(*dst, src_reg(ir->value.b[i] != 0 ? ~0 : 0)));
2412 break;
2413 default:
2414 unreachable("Non-float/uint/int/bool constant");
2415 }
2416
2417 remaining_writemask &= ~dst->writemask;
2418 }
2419 dst->reg_offset++;
2420 }
2421
2422 void
2423 vec4_visitor::visit(ir_constant *ir)
2424 {
2425 dst_reg dst = dst_reg(this, ir->type);
2426 this->result = src_reg(dst);
2427
2428 emit_constant_values(&dst, ir);
2429 }
2430
2431 void
2432 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2433 {
2434 ir_dereference *deref = static_cast<ir_dereference *>(
2435 ir->actual_parameters.get_head());
2436 ir_variable *location = deref->variable_referenced();
2437 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2438 location->data.binding);
2439
2440 /* Calculate the surface offset */
2441 src_reg offset(this, glsl_type::uint_type);
2442 ir_dereference_array *deref_array = deref->as_dereference_array();
2443 if (deref_array) {
2444 deref_array->array_index->accept(this);
2445
2446 src_reg tmp(this, glsl_type::uint_type);
2447 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2448 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2449 } else {
2450 offset = location->data.atomic.offset;
2451 }
2452
2453 /* Emit the appropriate machine instruction */
2454 const char *callee = ir->callee->function_name();
2455 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2456
2457 if (!strcmp("__intrinsic_atomic_read", callee)) {
2458 emit_untyped_surface_read(surf_index, dst, offset);
2459
2460 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2461 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2462 src_reg(), src_reg());
2463
2464 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2465 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2466 src_reg(), src_reg());
2467 }
2468
2469 brw_mark_surface_used(stage_prog_data, surf_index);
2470 }
2471
2472 void
2473 vec4_visitor::visit(ir_call *ir)
2474 {
2475 const char *callee = ir->callee->function_name();
2476
2477 if (!strcmp("__intrinsic_atomic_read", callee) ||
2478 !strcmp("__intrinsic_atomic_increment", callee) ||
2479 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2480 visit_atomic_counter_intrinsic(ir);
2481 } else {
2482 unreachable("Unsupported intrinsic.");
2483 }
2484 }
2485
2486 src_reg
2487 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
2488 src_reg coordinate, src_reg sampler)
2489 {
2490 vec4_instruction *inst =
2491 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
2492 dst_reg(this, glsl_type::uvec4_type));
2493 inst->base_mrf = 2;
2494 inst->src[1] = sampler;
2495
2496 int param_base;
2497
2498 if (devinfo->gen >= 9) {
2499 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
2500 vec4_instruction *header_inst = new(mem_ctx)
2501 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
2502 dst_reg(MRF, inst->base_mrf));
2503
2504 emit(header_inst);
2505
2506 inst->mlen = 2;
2507 inst->header_size = 1;
2508 param_base = inst->base_mrf + 1;
2509 } else {
2510 inst->mlen = 1;
2511 param_base = inst->base_mrf;
2512 }
2513
2514 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2515 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
2516 int zero_mask = 0xf & ~coord_mask;
2517
2518 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
2519 coordinate));
2520
2521 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
2522 src_reg(0)));
2523
2524 emit(inst);
2525 return src_reg(inst->dst);
2526 }
2527
2528 bool
2529 vec4_visitor::is_high_sampler(src_reg sampler)
2530 {
2531 if (devinfo->gen < 8 && !devinfo->is_haswell)
2532 return false;
2533
2534 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2535 }
2536
2537 void
2538 vec4_visitor::emit_texture(ir_texture_opcode op,
2539 dst_reg dest,
2540 const glsl_type *dest_type,
2541 src_reg coordinate,
2542 int coord_components,
2543 src_reg shadow_comparitor,
2544 src_reg lod, src_reg lod2,
2545 src_reg sample_index,
2546 uint32_t constant_offset,
2547 src_reg offset_value,
2548 src_reg mcs,
2549 bool is_cube_array,
2550 uint32_t sampler,
2551 src_reg sampler_reg)
2552 {
2553 enum opcode opcode;
2554 switch (op) {
2555 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2556 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2557 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2558 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2559 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2560 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2561 case ir_tg4: opcode = offset_value.file != BAD_FILE
2562 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2563 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2564 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
2565 case ir_txb:
2566 unreachable("TXB is not valid for vertex shaders.");
2567 case ir_lod:
2568 unreachable("LOD is not valid for vertex shaders.");
2569 default:
2570 unreachable("Unrecognized tex op");
2571 }
2572
2573 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
2574 opcode, dst_reg(this, dest_type));
2575
2576 inst->offset = constant_offset;
2577
2578 /* The message header is necessary for:
2579 * - Gen4 (always)
2580 * - Gen9+ for selecting SIMD4x2
2581 * - Texel offsets
2582 * - Gather channel selection
2583 * - Sampler indices too large to fit in a 4-bit value.
2584 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
2585 */
2586 inst->header_size =
2587 (devinfo->gen < 5 || devinfo->gen >= 9 ||
2588 inst->offset != 0 || op == ir_tg4 ||
2589 op == ir_texture_samples ||
2590 is_high_sampler(sampler_reg)) ? 1 : 0;
2591 inst->base_mrf = 2;
2592 inst->mlen = inst->header_size;
2593 inst->dst.writemask = WRITEMASK_XYZW;
2594 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
2595
2596 inst->src[1] = sampler_reg;
2597
2598 /* MRF for the first parameter */
2599 int param_base = inst->base_mrf + inst->header_size;
2600
2601 if (op == ir_txs || op == ir_query_levels) {
2602 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2603 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
2604 inst->mlen++;
2605 } else if (op == ir_texture_samples) {
2606 inst->dst.writemask = WRITEMASK_X;
2607 } else {
2608 /* Load the coordinate */
2609 /* FINISHME: gl_clamp_mask and saturate */
2610 int coord_mask = (1 << coord_components) - 1;
2611 int zero_mask = 0xf & ~coord_mask;
2612
2613 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
2614 coordinate));
2615 inst->mlen++;
2616
2617 if (zero_mask != 0) {
2618 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
2619 src_reg(0)));
2620 }
2621 /* Load the shadow comparitor */
2622 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
2623 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
2624 WRITEMASK_X),
2625 shadow_comparitor));
2626 inst->mlen++;
2627 }
2628
2629 /* Load the LOD info */
2630 if (op == ir_tex || op == ir_txl) {
2631 int mrf, writemask;
2632 if (devinfo->gen >= 5) {
2633 mrf = param_base + 1;
2634 if (shadow_comparitor.file != BAD_FILE) {
2635 writemask = WRITEMASK_Y;
2636 /* mlen already incremented */
2637 } else {
2638 writemask = WRITEMASK_X;
2639 inst->mlen++;
2640 }
2641 } else /* devinfo->gen == 4 */ {
2642 mrf = param_base;
2643 writemask = WRITEMASK_W;
2644 }
2645 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
2646 } else if (op == ir_txf) {
2647 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
2648 } else if (op == ir_txf_ms) {
2649 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
2650 sample_index));
2651 if (devinfo->gen >= 7) {
2652 /* MCS data is in the first channel of `mcs`, but we need to get it into
2653 * the .y channel of the second vec4 of params, so replicate .x across
2654 * the whole vec4 and then mask off everything except .y
2655 */
2656 mcs.swizzle = BRW_SWIZZLE_XXXX;
2657 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2658 mcs));
2659 }
2660 inst->mlen++;
2661 } else if (op == ir_txd) {
2662 const brw_reg_type type = lod.type;
2663
2664 if (devinfo->gen >= 5) {
2665 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2666 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2667 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
2668 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
2669 inst->mlen++;
2670
2671 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
2672 lod.swizzle = BRW_SWIZZLE_ZZZZ;
2673 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
2674 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
2675 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
2676 inst->mlen++;
2677
2678 if (shadow_comparitor.file != BAD_FILE) {
2679 emit(MOV(dst_reg(MRF, param_base + 2,
2680 shadow_comparitor.type, WRITEMASK_Z),
2681 shadow_comparitor));
2682 }
2683 }
2684 } else /* devinfo->gen == 4 */ {
2685 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
2686 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
2687 inst->mlen += 2;
2688 }
2689 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
2690 if (shadow_comparitor.file != BAD_FILE) {
2691 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
2692 shadow_comparitor));
2693 }
2694
2695 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2696 offset_value));
2697 inst->mlen++;
2698 }
2699 }
2700
2701 emit(inst);
2702
2703 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2704 * spec requires layers.
2705 */
2706 if (op == ir_txs && is_cube_array) {
2707 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2708 writemask(inst->dst, WRITEMASK_Z),
2709 src_reg(inst->dst), src_reg(6));
2710 }
2711
2712 if (devinfo->gen == 6 && op == ir_tg4) {
2713 emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
2714 }
2715
2716 swizzle_result(op, dest,
2717 src_reg(inst->dst), sampler, dest_type);
2718 }
2719
2720 void
2721 vec4_visitor::visit(ir_texture *ir)
2722 {
2723 uint32_t sampler =
2724 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2725
2726 ir_rvalue *nonconst_sampler_index =
2727 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2728
2729 /* Handle non-constant sampler array indexing */
2730 src_reg sampler_reg;
2731 if (nonconst_sampler_index) {
2732 /* The highest sampler which may be used by this operation is
2733 * the last element of the array. Mark it here, because the generator
2734 * doesn't have enough information to determine the bound.
2735 */
2736 uint32_t array_size = ir->sampler->as_dereference_array()
2737 ->array->type->array_size();
2738
2739 uint32_t max_used = sampler + array_size - 1;
2740 if (ir->op == ir_tg4 && devinfo->gen < 8) {
2741 max_used += prog_data->base.binding_table.gather_texture_start;
2742 } else {
2743 max_used += prog_data->base.binding_table.texture_start;
2744 }
2745
2746 brw_mark_surface_used(&prog_data->base, max_used);
2747
2748 /* Emit code to evaluate the actual indexing expression */
2749 nonconst_sampler_index->accept(this);
2750 src_reg temp(this, glsl_type::uint_type);
2751 emit(ADD(dst_reg(temp), this->result, src_reg(sampler)));
2752 sampler_reg = emit_uniformize(temp);
2753 } else {
2754 /* Single sampler, or constant array index; the indexing expression
2755 * is just an immediate.
2756 */
2757 sampler_reg = src_reg(sampler);
2758 }
2759
2760 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2761 * emitting anything other than setting up the constant result.
2762 */
2763 if (ir->op == ir_tg4) {
2764 ir_constant *chan = ir->lod_info.component->as_constant();
2765 int swiz = GET_SWZ(key_tex->swizzles[sampler], chan->value.i[0]);
2766 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2767 dst_reg result(this, ir->type);
2768 this->result = src_reg(result);
2769 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2770 return;
2771 }
2772 }
2773
2774 /* Should be lowered by do_lower_texture_projection */
2775 assert(!ir->projector);
2776
2777 /* Should be lowered */
2778 assert(!ir->offset || !ir->offset->type->is_array());
2779
2780 /* Generate code to compute all the subexpression trees. This has to be
2781 * done before loading any values into MRFs for the sampler message since
2782 * generating these values may involve SEND messages that need the MRFs.
2783 */
2784 src_reg coordinate;
2785 int coord_components = 0;
2786 if (ir->coordinate) {
2787 coord_components = ir->coordinate->type->vector_elements;
2788 ir->coordinate->accept(this);
2789 coordinate = this->result;
2790 }
2791
2792 src_reg shadow_comparitor;
2793 if (ir->shadow_comparitor) {
2794 ir->shadow_comparitor->accept(this);
2795 shadow_comparitor = this->result;
2796 }
2797
2798 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2799 src_reg offset_value;
2800 if (has_nonconstant_offset) {
2801 ir->offset->accept(this);
2802 offset_value = src_reg(this->result);
2803 }
2804
2805 src_reg lod, lod2, sample_index, mcs;
2806 switch (ir->op) {
2807 case ir_tex:
2808 lod = src_reg(0.0f);
2809 break;
2810 case ir_txf:
2811 case ir_txl:
2812 case ir_txs:
2813 ir->lod_info.lod->accept(this);
2814 lod = this->result;
2815 break;
2816 case ir_query_levels:
2817 lod = src_reg(0);
2818 break;
2819 case ir_txf_ms:
2820 ir->lod_info.sample_index->accept(this);
2821 sample_index = this->result;
2822
2823 if (devinfo->gen >= 7 && key_tex->compressed_multisample_layout_mask & (1 << sampler))
2824 mcs = emit_mcs_fetch(ir->coordinate->type, coordinate, sampler_reg);
2825 else
2826 mcs = src_reg(0u);
2827 break;
2828 case ir_txd:
2829 ir->lod_info.grad.dPdx->accept(this);
2830 lod = this->result;
2831
2832 ir->lod_info.grad.dPdy->accept(this);
2833 lod2 = this->result;
2834 break;
2835 case ir_txb:
2836 case ir_lod:
2837 case ir_tg4:
2838 case ir_texture_samples:
2839 break;
2840 }
2841
2842 uint32_t constant_offset = 0;
2843 if (ir->offset != NULL && !has_nonconstant_offset) {
2844 constant_offset =
2845 brw_texture_offset(ir->offset->as_constant()->value.i,
2846 ir->offset->type->vector_elements);
2847 }
2848
2849 /* Stuff the channel select bits in the top of the texture offset */
2850 if (ir->op == ir_tg4)
2851 constant_offset |=
2852 gather_channel( ir->lod_info.component->as_constant()->value.i[0],
2853 sampler) << 16;
2854
2855 glsl_type const *type = ir->sampler->type;
2856 bool is_cube_array = type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2857 type->sampler_array;
2858
2859 this->result = src_reg(this, ir->type);
2860 dst_reg dest = dst_reg(this->result);
2861
2862 emit_texture(ir->op, dest, ir->type, coordinate, coord_components,
2863 shadow_comparitor,
2864 lod, lod2, sample_index,
2865 constant_offset, offset_value,
2866 mcs, is_cube_array, sampler, sampler_reg);
2867 }
2868
2869 /**
2870 * Apply workarounds for Gen6 gather with UINT/SINT
2871 */
2872 void
2873 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2874 {
2875 if (!wa)
2876 return;
2877
2878 int width = (wa & WA_8BIT) ? 8 : 16;
2879 dst_reg dst_f = dst;
2880 dst_f.type = BRW_REGISTER_TYPE_F;
2881
2882 /* Convert from UNORM to UINT */
2883 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2884 emit(MOV(dst, src_reg(dst_f)));
2885
2886 if (wa & WA_SIGN) {
2887 /* Reinterpret the UINT value as a signed INT value by
2888 * shifting the sign bit into place, then shifting back
2889 * preserving sign.
2890 */
2891 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2892 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2893 }
2894 }
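/* A worked example of the fixup above (illustrative, not part of the
 * original source): for an 8-bit SINT format the gather returns UNORM data,
 * so a raw byte of 0xc8 comes back as 200/255. The MUL/MOV pair converts
 * that back to the integer 200, and the SHL/ASR by 24 then sign-extends it
 * to the intended value -56.
 */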
2895
2896 /**
2897 * Set up the gather channel based on the swizzle, for gather4.
2898 */
2899 uint32_t
2900 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
2901 {
2902 int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
2903 switch (swiz) {
2904 case SWIZZLE_X: return 0;
2905 case SWIZZLE_Y:
2906 /* gather4 sampler is broken for green channel on RG32F --
2907 * we must ask for blue instead.
2908 */
2909 if (key_tex->gather_channel_quirk_mask & (1 << sampler))
2910 return 2;
2911 return 1;
2912 case SWIZZLE_Z: return 2;
2913 case SWIZZLE_W: return 3;
2914 default:
2915 unreachable("Not reached"); /* zero, one swizzles handled already */
2916 }
2917 }
2918
2919 void
2920 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
2921 src_reg orig_val, uint32_t sampler,
2922 const glsl_type *dest_type)
2923 {
2924 int s = key_tex->swizzles[sampler];
2925
2926 dst_reg swizzled_result = dest;
2927
2928 if (op == ir_query_levels) {
2929 /* # levels is in .w */
2930 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2931 emit(MOV(swizzled_result, orig_val));
2932 return;
2933 }
2934
2935 if (op == ir_txs || dest_type == glsl_type::float_type
2936 || s == SWIZZLE_NOOP || op == ir_tg4) {
2937 emit(MOV(swizzled_result, orig_val));
2938 return;
2939 }
2940
2941
2942 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2943 int swizzle[4] = {0};
2944
2945 for (int i = 0; i < 4; i++) {
2946 switch (GET_SWZ(s, i)) {
2947 case SWIZZLE_ZERO:
2948 zero_mask |= (1 << i);
2949 break;
2950 case SWIZZLE_ONE:
2951 one_mask |= (1 << i);
2952 break;
2953 default:
2954 copy_mask |= (1 << i);
2955 swizzle[i] = GET_SWZ(s, i);
2956 break;
2957 }
2958 }
2959
2960 if (copy_mask) {
2961 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2962 swizzled_result.writemask = copy_mask;
2963 emit(MOV(swizzled_result, orig_val));
2964 }
2965
2966 if (zero_mask) {
2967 swizzled_result.writemask = zero_mask;
2968 emit(MOV(swizzled_result, src_reg(0.0f)));
2969 }
2970
2971 if (one_mask) {
2972 swizzled_result.writemask = one_mask;
2973 emit(MOV(swizzled_result, src_reg(1.0f)));
2974 }
2975 }
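/* Illustrative example (not part of the original source): with a texture
 * swizzle of GREEN/RED/ZERO/ONE, copy_mask covers .xy with the source
 * swizzled YXXX, zero_mask is .z and one_mask is .w, so the swizzled result
 * is assembled from at most three MOVs.
 */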
2976
2977 void
2978 vec4_visitor::visit(ir_return *)
2979 {
2980 unreachable("not reached");
2981 }
2982
2983 void
2984 vec4_visitor::visit(ir_discard *)
2985 {
2986 unreachable("not reached");
2987 }
2988
2989 void
2990 vec4_visitor::visit(ir_if *ir)
2991 {
2992 /* Don't point the annotation at the if statement, because then it plus
2993 * the then and else blocks get printed.
2994 */
2995 this->base_ir = ir->condition;
2996
2997 if (devinfo->gen == 6) {
2998 emit_if_gen6(ir);
2999 } else {
3000 enum brw_predicate predicate;
3001 emit_bool_to_cond_code(ir->condition, &predicate);
3002 emit(IF(predicate));
3003 }
3004
3005 visit_instructions(&ir->then_instructions);
3006
3007 if (!ir->else_instructions.is_empty()) {
3008 this->base_ir = ir->condition;
3009 emit(BRW_OPCODE_ELSE);
3010
3011 visit_instructions(&ir->else_instructions);
3012 }
3013
3014 this->base_ir = ir->condition;
3015 emit(BRW_OPCODE_ENDIF);
3016 }
3017
3018 void
3019 vec4_visitor::gs_emit_vertex(int stream_id)
3020 {
3021 unreachable("not reached");
3022 }
3023
3024 void
3025 vec4_visitor::visit(ir_emit_vertex *)
3026 {
3027 unreachable("not reached");
3028 }
3029
3030 void
3031 vec4_visitor::gs_end_primitive()
3032 {
3033 unreachable("not reached");
3034 }
3035
3036
3037 void
3038 vec4_visitor::visit(ir_end_primitive *)
3039 {
3040 unreachable("not reached");
3041 }
3042
3043 void
3044 vec4_visitor::visit(ir_barrier *)
3045 {
3046 unreachable("not reached");
3047 }
3048
3049 void
3050 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
3051 dst_reg dst, src_reg offset,
3052 src_reg src0, src_reg src1)
3053 {
3054 unsigned mlen = 0;
3055
3056 /* Set the atomic operation offset. */
3057 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
3058 mlen++;
3059
3060 /* Set the atomic operation arguments. */
3061 if (src0.file != BAD_FILE) {
3062 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
3063 mlen++;
3064 }
3065
3066 if (src1.file != BAD_FILE) {
3067 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
3068 mlen++;
3069 }
3070
3071 /* Emit the instruction. Note that this maps to the normal SIMD8
3072 * untyped atomic message on Ivy Bridge, but that's OK because
3073 * unused channels will be masked out.
3074 */
3075 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
3076 brw_message_reg(0),
3077 src_reg(surf_index), src_reg(atomic_op));
3078 inst->mlen = mlen;
3079 }
3080
3081 void
3082 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
3083 src_reg offset)
3084 {
3085 /* Set the surface read offset. */
3086 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
3087
3088 /* Emit the instruction. Note that this maps to the normal SIMD8
3089 * untyped surface read message, but that's OK because unused
3090 * channels will be masked out.
3091 */
3092 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
3093 brw_message_reg(0),
3094 src_reg(surf_index), src_reg(1));
3095 inst->mlen = 1;
3096 }
3097
3098 void
3099 vec4_visitor::emit_ndc_computation()
3100 {
3101 /* Get the position */
3102 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
3103
3104 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
3105 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
3106 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
3107
3108 current_annotation = "NDC";
3109 dst_reg ndc_w = ndc;
3110 ndc_w.writemask = WRITEMASK_W;
3111 src_reg pos_w = pos;
3112 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
3113 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
3114
3115 dst_reg ndc_xyz = ndc;
3116 ndc_xyz.writemask = WRITEMASK_XYZ;
3117
3118 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
3119 }
3120
3121 void
3122 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
3123 {
3124 if (devinfo->gen < 6 &&
3125 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
3126 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
3127 devinfo->has_negative_rhw_bug)) {
3128 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
3129 dst_reg header1_w = header1;
3130 header1_w.writemask = WRITEMASK_W;
3131
3132 emit(MOV(header1, 0u));
3133
3134 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3135 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3136
3137 current_annotation = "Point size";
3138 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
3139 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
3140 }
3141
3142 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
3143 current_annotation = "Clipping flags";
3144 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
3145 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
3146
3147 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
3148 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
3149 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
3150
3151 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
3152 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
3153 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
3154 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
3155 }
3156
3157 /* i965 clipping workaround:
3158 * 1) Test for -ve rhw
3159 * 2) If set,
3160 * set ndc = (0,0,0,0)
3161 * set ucp[6] = 1
3162 *
3163 * Later, clipping will detect ucp[6] and ensure the primitive is
3164 * clipped against all fixed planes.
3165 */
3166 if (devinfo->has_negative_rhw_bug) {
3167 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
3168 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
3169 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
3170 vec4_instruction *inst;
3171 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
3172 inst->predicate = BRW_PREDICATE_NORMAL;
3173 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
3174 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
3175 inst->predicate = BRW_PREDICATE_NORMAL;
3176 }
3177
3178 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
3179 } else if (devinfo->gen < 6) {
3180 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
3181 } else {
3182 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
3183 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
3184 dst_reg reg_w = reg;
3185 reg_w.writemask = WRITEMASK_W;
3186 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
3187 reg_as_src.type = reg_w.type;
3188 reg_as_src.swizzle = brw_swizzle_for_size(1);
3189 emit(MOV(reg_w, reg_as_src));
3190 }
3191 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3192 dst_reg reg_y = reg;
3193 reg_y.writemask = WRITEMASK_Y;
3194 reg_y.type = BRW_REGISTER_TYPE_D;
3195 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
3196 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3197 }
3198 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3199 dst_reg reg_z = reg;
3200 reg_z.writemask = WRITEMASK_Z;
3201 reg_z.type = BRW_REGISTER_TYPE_D;
3202 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
3203 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3204 }
3205 }
3206 }
3207
3208 vec4_instruction *
3209 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3210 {
3211 assert(varying < VARYING_SLOT_MAX);
3212 assert(output_reg[varying].type == reg.type);
3213 current_annotation = output_reg_annotation[varying];
3214 /* Copy the register, saturating if necessary */
3215 return emit(MOV(reg, src_reg(output_reg[varying])));
3216 }
3217
3218 void
3219 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3220 {
3221 reg.type = BRW_REGISTER_TYPE_F;
3222 output_reg[varying].type = reg.type;
3223
3224 switch (varying) {
3225 case VARYING_SLOT_PSIZ:
3226 {
3227 /* PSIZ is always in slot 0, and is coupled with other flags. */
3228 current_annotation = "indices, point width, clip flags";
3229 emit_psiz_and_flags(reg);
3230 break;
3231 }
3232 case BRW_VARYING_SLOT_NDC:
3233 current_annotation = "NDC";
3234 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3235 break;
3236 case VARYING_SLOT_POS:
3237 current_annotation = "gl_Position";
3238 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3239 break;
3240 case VARYING_SLOT_EDGE:
3241 /* This is present when doing unfilled polygons. We're supposed to copy
3242 * the edge flag from the user-provided vertex array
3243 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3244 * of that attribute (starts as 1.0f). This is then used in clipping to
3245 * determine which edges should be drawn as wireframe.
3246 */
3247 current_annotation = "edge flag";
3248 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3249 glsl_type::float_type, WRITEMASK_XYZW))));
3250 break;
3251 case BRW_VARYING_SLOT_PAD:
3252 /* No need to write to this slot */
3253 break;
3254 default:
3255 emit_generic_urb_slot(reg, varying);
3256 break;
3257 }
3258 }
3259
3260 static int
3261 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
3262 {
3263 if (devinfo->gen >= 6) {
3264 /* URB data written (does not include the message header reg) must
3265 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3266 * section 5.4.3.2.2: URB_INTERLEAVED.
3267 *
3268 * URB entries are allocated on a multiple of 1024 bits, so an
3269 * extra 128 bits written here to make the end align to 256 is
3270 * no problem.
3271 */
3272 if ((mlen % 2) != 1)
3273 mlen++;
3274 }
3275
3276 return mlen;
3277 }
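/* A quick sanity check of the rounding above (illustrative only): mlen
 * includes the single message header register, so the URB data itself is
 * mlen - 1 registers. A length of 4 (header plus 3 data registers) is
 * bumped to 5 so that an even number of data registers is written, as
 * URB_INTERLEAVED requires.
 */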
3278
3279
3280 /**
3281 * Generates the VUE payload plus the necessary URB write instructions to
3282 * output it.
3283 *
3284 * The VUE layout is documented in Volume 2a.
3285 */
3286 void
3287 vec4_visitor::emit_vertex()
3288 {
3289 /* MRF 0 is reserved for the debugger, so start with message header
3290 * in MRF 1.
3291 */
3292 int base_mrf = 1;
3293 int mrf = base_mrf;
3294 /* In the process of generating our URB write message contents, we
3295 * may need to unspill a register or load from an array. Those
3296 * reads would use MRFs 14-15.
3297 */
3298 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
3299
3300 /* The following assertion verifies that max_usable_mrf causes an
3301 * even-numbered amount of URB write data, which will meet gen6's
3302 * requirements for length alignment.
3303 */
3304 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3305
3306 /* First mrf is the g0-based message header containing URB handles and
3307 * such.
3308 */
3309 emit_urb_write_header(mrf++);
3310
3311 if (devinfo->gen < 6) {
3312 emit_ndc_computation();
3313 }
3314
3315 /* We may need to split this up into several URB writes, so do them in a
3316 * loop.
3317 */
3318 int slot = 0;
3319 bool complete = false;
3320 do {
3321 /* URB offset is in URB row increments, and each of our MRFs is half of
3322 * one of those, since we're doing interleaved writes.
3323 */
3324 int offset = slot / 2;
3325
3326 mrf = base_mrf + 1;
3327 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3328 emit_urb_slot(dst_reg(MRF, mrf++),
3329 prog_data->vue_map.slot_to_varying[slot]);
3330
3331 /* If this was max_usable_mrf, we can't fit anything more into this
3332 * URB WRITE. Same thing if we reached the maximum length available.
3333 */
3334 if (mrf > max_usable_mrf ||
3335 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
3336 slot++;
3337 break;
3338 }
3339 }
3340
3341 complete = slot >= prog_data->vue_map.num_slots;
3342 current_annotation = "URB write";
3343 vec4_instruction *inst = emit_urb_write_opcode(complete);
3344 inst->base_mrf = base_mrf;
3345 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
3346 inst->offset += offset;
3347 } while(!complete);
3348 }
3349
3350
3351 src_reg
3352 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3353 src_reg *reladdr, int reg_offset)
3354 {
3355 /* Because we store the values to scratch interleaved like our
3356 * vertex data, we need to scale the vec4 index by 2.
3357 */
3358 int message_header_scale = 2;
3359
3360 /* Pre-gen6, the message header uses byte offsets instead of vec4
3361 * (16-byte) offset units.
3362 */
3363 if (devinfo->gen < 6)
3364 message_header_scale *= 16;
3365
3366 if (reladdr) {
3367 src_reg index = src_reg(this, glsl_type::int_type);
3368
3369 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3370 src_reg(reg_offset)));
3371 emit_before(block, inst, MUL(dst_reg(index), index,
3372 src_reg(message_header_scale)));
3373
3374 return index;
3375 } else {
3376 return src_reg(reg_offset * message_header_scale);
3377 }
3378 }
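/* Illustrative example (not part of the original source): on Gen6 and later
 * a constant reg_offset of 3 just yields the immediate 6, i.e. the vec4 row
 * doubled for the interleaved layout, while on Gen4/5 the same offset
 * becomes 3 * 32 = 96 because the message header addresses scratch in bytes.
 */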
3379
3380 src_reg
3381 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3382 src_reg *reladdr, int reg_offset)
3383 {
3384 if (reladdr) {
3385 src_reg index = src_reg(this, glsl_type::int_type);
3386
3387 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3388 src_reg(reg_offset)));
3389
3390 /* Pre-gen6, the message header uses byte offsets instead of vec4
3391 * (16-byte) offset units.
3392 */
3393 if (devinfo->gen < 6) {
3394 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3395 }
3396
3397 return index;
3398 } else if (devinfo->gen >= 8) {
3399 /* Store the offset in a GRF so we can send-from-GRF. */
3400 src_reg offset = src_reg(this, glsl_type::int_type);
3401 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3402 return offset;
3403 } else {
3404 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
3405 return src_reg(reg_offset * message_header_scale);
3406 }
3407 }
3408
3409 /**
3410 * Emits an instruction before @inst to load the value named by @orig_src
3411 * from scratch space at @base_offset to @temp.
3412 *
3413 * @base_offset is measured in 32-byte units (the size of a register).
3414 */
3415 void
3416 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3417 dst_reg temp, src_reg orig_src,
3418 int base_offset)
3419 {
3420 int reg_offset = base_offset + orig_src.reg_offset;
3421 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3422 reg_offset);
3423
3424 emit_before(block, inst, SCRATCH_READ(temp, index));
3425 }
3426
3427 /**
3428 * Emits an instruction after @inst to store the value to be written
3429 * to @orig_dst to scratch space at @base_offset, from @temp.
3430 *
3431 * @base_offset is measured in 32-byte units (the size of a register).
3432 */
3433 void
3434 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3435 int base_offset)
3436 {
3437 int reg_offset = base_offset + inst->dst.reg_offset;
3438 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3439 reg_offset);
3440
3441 /* Create a temporary register to store *inst's result in.
3442 *
3443 * We have to be careful in MOVing from our temporary result register in
3444 * the scratch write. If we swizzle from channels of the temporary that
3445 * weren't initialized, it will confuse live interval analysis, which will
3446 * make spilling fail to make progress.
3447 */
3448 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
3449 inst->dst.type),
3450 brw_swizzle_for_mask(inst->dst.writemask));
3451 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3452 inst->dst.writemask));
3453 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3454 if (inst->opcode != BRW_OPCODE_SEL)
3455 write->predicate = inst->predicate;
3456 write->ir = inst->ir;
3457 write->annotation = inst->annotation;
3458 inst->insert_after(block, write);
3459
3460 inst->dst.file = temp.file;
3461 inst->dst.reg = temp.reg;
3462 inst->dst.reg_offset = temp.reg_offset;
3463 inst->dst.reladdr = NULL;
3464 }
3465
3466 /**
3467 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
3468 * adds the scratch read(s) before \p inst. The function also checks for
3469 * recursive reladdr scratch accesses, issuing the corresponding scratch
3470 * loads and rewriting reladdr references accordingly.
3471 *
3472 * \return \p src if it did not require a scratch load, otherwise, the
3473 * register holding the result of the scratch load that the caller should
3474 * use to rewrite src.
3475 */
3476 src_reg
3477 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
3478 vec4_instruction *inst, src_reg src)
3479 {
3480 /* Resolve recursive reladdr scratch access by calling ourselves
3481 * with src.reladdr
3482 */
3483 if (src.reladdr)
3484 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3485 *src.reladdr);
3486
3487 /* Now handle scratch access on src */
3488 if (src.file == GRF && scratch_loc[src.reg] != -1) {
3489 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3490 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
3491 src.reg = temp.reg;
3492 src.reg_offset = temp.reg_offset;
3493 src.reladdr = NULL;
3494 }
3495
3496 return src;
3497 }
3498
3499 /**
3500 * We can't generally support array access in GRF space, because a
3501 * single instruction's destination can only span 2 contiguous
3502 * registers. So, we send all GRF arrays that get variable index
3503 * access to scratch space.
3504 */
3505 void
3506 vec4_visitor::move_grf_array_access_to_scratch()
3507 {
3508 int scratch_loc[this->alloc.count];
3509 memset(scratch_loc, -1, sizeof(scratch_loc));
3510
3511 /* First, calculate the set of virtual GRFs that need to be punted
3512 * to scratch due to having any array access on them, and where in
3513 * scratch.
3514 */
3515 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3516 if (inst->dst.file == GRF && inst->dst.reladdr) {
3517 if (scratch_loc[inst->dst.reg] == -1) {
3518 scratch_loc[inst->dst.reg] = last_scratch;
3519 last_scratch += this->alloc.sizes[inst->dst.reg];
3520 }
3521
3522 for (src_reg *iter = inst->dst.reladdr;
3523 iter->reladdr;
3524 iter = iter->reladdr) {
3525 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3526 scratch_loc[iter->reg] = last_scratch;
3527 last_scratch += this->alloc.sizes[iter->reg];
3528 }
3529 }
3530 }
3531
3532 for (int i = 0 ; i < 3; i++) {
3533 for (src_reg *iter = &inst->src[i];
3534 iter->reladdr;
3535 iter = iter->reladdr) {
3536 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
3537 scratch_loc[iter->reg] = last_scratch;
3538 last_scratch += this->alloc.sizes[iter->reg];
3539 }
3540 }
3541 }
3542 }
3543
3544 /* Now, for anything that will be accessed through scratch, rewrite
3545 * it to load/store. Note that this is a _safe list walk, because
3546 * we may generate a new scratch_write instruction after the one
3547 * we're processing.
3548 */
3549 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3550 /* Set up the annotation tracking for new generated instructions. */
3551 base_ir = inst->ir;
3552 current_annotation = inst->annotation;
3553
3554 /* First handle scratch access on the dst. Notice we have to handle
3555 * the case where the dst's reladdr also points to scratch space.
3556 */
3557 if (inst->dst.reladdr)
3558 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
3559 *inst->dst.reladdr);
3560
3561 /* Now that we have handled any (possibly recursive) reladdr scratch
3562 * accesses for dst we can safely do the scratch write for dst itself
3563 */
3564 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
3565 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3566
3567 /* Now handle scratch access on any src. In this case, since inst->src[i]
3568 * already is a src_reg, we can just call emit_resolve_reladdr with
3569 * inst->src[i] and it will take care of handling scratch loads for
3570 * both src and src.reladdr (recursively).
3571 */
3572 for (int i = 0 ; i < 3; i++) {
3573 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
3574 inst->src[i]);
3575 }
3576 }
3577 }
3578
3579 /**
3580 * Emits an instruction before @inst to load the value named by @orig_src
3581 * from the pull constant buffer (surface) at @base_offset to @temp.
3582 */
3583 void
3584 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3585 dst_reg temp, src_reg orig_src,
3586 int base_offset)
3587 {
3588 int reg_offset = base_offset + orig_src.reg_offset;
3589 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3590 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3591 reg_offset);
3592
3593 emit_pull_constant_load_reg(temp,
3594 index,
3595 offset,
3596 block, inst);
3597 }
3598
3599 /**
3600 * Implements array access of uniforms by inserting a
3601 * PULL_CONSTANT_LOAD instruction.
3602 *
3603 * Unlike temporary GRF array access (where we don't support it due to
3604 * the difficulty of doing relative addressing on instruction
3605 * destinations), we could potentially do array access of uniforms
3606 * that were loaded in GRF space as push constants. In real-world
3607 * usage we've seen, though, the arrays being used are always larger
3608 * than we could load as push constants, so just always move all
3609 * uniform array access out to a pull constant buffer.
3610 */
3611 void
3612 vec4_visitor::move_uniform_array_access_to_pull_constants()
3613 {
3614 int pull_constant_loc[this->uniforms];
3615 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3616 bool nested_reladdr;
3617
3618 /* Walk through and find array access of uniforms. Put a copy of that
3619 * uniform in the pull constant buffer.
3620 *
3621 * Note that we don't move constant-indexed accesses to arrays. No
3622 * testing has been done of the performance impact of this choice.
3623 */
3624 do {
3625 nested_reladdr = false;
3626
3627 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3628 for (int i = 0 ; i < 3; i++) {
3629 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3630 continue;
3631
3632 int uniform = inst->src[i].reg;
3633
3634 if (inst->src[i].reladdr->reladdr)
3635 nested_reladdr = true; /* will need another pass */
3636
3637 /* If this array isn't already present in the pull constant buffer,
3638 * add it.
3639 */
3640 if (pull_constant_loc[uniform] == -1) {
3641 const gl_constant_value **values =
3642 &stage_prog_data->param[uniform * 4];
3643
3644 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3645
3646 assert(uniform < uniform_array_size);
3647 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3648 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3649 = values[j];
3650 }
3651 }
3652
3653 /* Set up the annotation tracking for new generated instructions. */
3654 base_ir = inst->ir;
3655 current_annotation = inst->annotation;
3656
3657 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3658
3659 emit_pull_constant_load(block, inst, temp, inst->src[i],
3660 pull_constant_loc[uniform]);
3661
3662 inst->src[i].file = temp.file;
3663 inst->src[i].reg = temp.reg;
3664 inst->src[i].reg_offset = temp.reg_offset;
3665 inst->src[i].reladdr = NULL;
3666 }
3667 }
3668 } while (nested_reladdr);
3669
3670 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3671 * there is no need to track them as larger-than-vec4 objects. This is
3672 * relied on when cutting unused uniform vectors out of the push
3673 * constants.
3674 */
3675 split_uniform_registers();
3676 }
3677
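/**
 * Resolves a negation modifier on an unsigned (UD) source.
 *
 * Rather than leaving the negate flag on a UD-typed register for later
 * instructions to deal with, apply it now by copying the source through a
 * MOV into a fresh unsigned temporary and using that temporary instead.
 */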
3678 void
3679 vec4_visitor::resolve_ud_negate(src_reg *reg)
3680 {
3681 if (reg->type != BRW_REGISTER_TYPE_UD ||
3682 !reg->negate)
3683 return;
3684
3685 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3686 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3687 *reg = temp;
3688 }
3689
3690 /**
3691 * Resolve the result of a Gen4-5 CMP instruction to a proper boolean.
3692 *
3693 * CMP on Gen4-5 only sets the LSB of the result; the rest are undefined.
3694 * If we need a proper boolean value, we have to fix it up to be 0 or ~0.
3695 */
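/* For example, if the CMP left 0x7f3a0001 in the register, the AND with 1
 * produces 1 and the negated MOV turns that into -1 (i.e. ~0); a result
 * whose LSB is 0 ends up as 0.
 */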
3696 void
3697 vec4_visitor::resolve_bool_comparison(ir_rvalue *rvalue, src_reg *reg)
3698 {
3699 assert(devinfo->gen <= 5);
3700
3701 if (!rvalue->type->is_boolean())
3702 return;
3703
3704 src_reg and_result = src_reg(this, rvalue->type);
3705 src_reg neg_result = src_reg(this, rvalue->type);
3706 emit(AND(dst_reg(and_result), *reg, src_reg(1)));
3707 emit(MOV(dst_reg(neg_result), negate(and_result)));
3708 *reg = neg_result;
3709 }
3710
3711 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
3712 void *log_data,
3713 struct gl_program *prog,
3714 const struct brw_sampler_prog_key_data *key_tex,
3715 struct brw_vue_prog_data *prog_data,
3716 struct gl_shader_program *shader_prog,
3717 gl_shader_stage stage,
3718 void *mem_ctx,
3719 bool no_spills,
3720 int shader_time_index)
3721 : backend_shader(compiler, log_data, mem_ctx,
3722 shader_prog, prog, &prog_data->base, stage),
3723 key_tex(key_tex),
3724 prog_data(prog_data),
3725 sanity_param_count(0),
3726 fail_msg(NULL),
3727 first_non_payload_grf(0),
3728 need_all_constants_in_pull_buffer(false),
3729 no_spills(no_spills),
3730 shader_time_index(shader_time_index),
3731 last_scratch(0)
3732 {
3733 this->failed = false;
3734
3735 this->base_ir = NULL;
3736 this->current_annotation = NULL;
3737 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3738
3739 this->variable_ht = hash_table_ctor(0,
3740 hash_table_pointer_hash,
3741 hash_table_pointer_compare);
3742
3743 this->virtual_grf_start = NULL;
3744 this->virtual_grf_end = NULL;
3745 this->live_intervals = NULL;
3746
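/* Gen7+ has no real MRF registers; message payloads are built in the top
 * of the GRF file starting at GEN7_MRF_HACK_START instead, so cap register
 * allocation below that point.
 */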
3747 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3748
3749 this->uniforms = 0;
3750
3751 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3752 * at least one. See setup_uniforms() in brw_vec4.cpp.
3753 */
3754 this->uniform_array_size = 1;
3755 if (prog_data) {
3756 this->uniform_array_size =
3757 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
3758 }
3759
3760 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3761 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3762 }
3763
3764 vec4_visitor::~vec4_visitor()
3765 {
3766 hash_table_dtor(this->variable_ht);
3767 }
3768
3769
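/**
 * Marks the compile as failed and records a diagnostic message.
 *
 * Only the first failure is kept; later calls return immediately so the
 * original message is preserved.
 */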
3770 void
3771 vec4_visitor::fail(const char *format, ...)
3772 {
3773 va_list va;
3774 char *msg;
3775
3776 if (failed)
3777 return;
3778
3779 failed = true;
3780
3781 va_start(va, format);
3782 msg = ralloc_vasprintf(mem_ctx, format, va);
3783 va_end(va);
3784 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
3785
3786 this->fail_msg = msg;
3787
3788 if (debug_enabled) {
3789 fprintf(stderr, "%s", msg);
3790 }
3791 }
3792
3793 } /* namespace brw */