src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp (mesa.git, 'origin/master' merged into the vulkan branch)
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "brw_program.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
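/* Note that these ALU helpers only construct a vec4_instruction with
 * new(mem_ctx); they do not add it to the instruction stream. Callers are
 * expected to wrap them in emit(), e.g.
 *
 *    emit(MOV(dst, src));
 *    emit(ADD(dst, src0, src1));
 *
 * which is the pattern used throughout the rest of this file.
 */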
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240
241 resolve_ud_negate(&src0);
242 resolve_ud_negate(&src1);
243
244 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
245 inst->conditional_mod = condition;
246
247 return inst;
248 }
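/* Typical usage pairs a CMP that updates the flag register with a predicated
 * instruction that consumes it, as emit_psiz_and_flags() does later in this
 * file. A minimal sketch of that flow, where a, b, src0, src1 and dst stand
 * for whatever registers the caller is comparing and merging:
 *
 *    emit(CMP(dst_null_f(), a, b, BRW_CONDITIONAL_L));
 *    vec4_instruction *inst = emit(OR(dst, src0, src1));
 *    inst->predicate = BRW_PREDICATE_NORMAL;
 *
 * The null destination discards the per-channel comparison bits and keeps
 * only the packed flag result.
 */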
249
250 vec4_instruction *
251 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
252 {
253 vec4_instruction *inst;
254
255 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
256 dst, index);
257 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
258 inst->mlen = 2;
259
260 return inst;
261 }
262
263 vec4_instruction *
264 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
265 const src_reg &index)
266 {
267 vec4_instruction *inst;
268
269 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
270 dst, src, index);
271 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
272 inst->mlen = 3;
273
274 return inst;
275 }
276
277 src_reg
278 vec4_visitor::fix_3src_operand(const src_reg &src)
279 {
280 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
281 * able to use vertical stride of zero to replicate the vec4 uniform, like
282 *
283 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
284 *
285 * But you can't, since vertical stride is always four in three-source
286 * instructions. Instead, insert a MOV instruction to do the replication so
287 * that the three-source instruction can consume it.
288 */
289
290 /* The MOV is only needed if the source is a uniform or immediate. */
291 if (src.file != UNIFORM && src.file != IMM)
292 return src;
293
294 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
295 return src;
296
297 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
298 expanded.type = src.type;
299 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
300 return src_reg(expanded);
301 }
302
303 src_reg
304 vec4_visitor::resolve_source_modifiers(const src_reg &src)
305 {
306 if (!src.abs && !src.negate)
307 return src;
308
309 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
310 resolved.type = src.type;
311 emit(MOV(resolved, src));
312
313 return src_reg(resolved);
314 }
315
316 src_reg
317 vec4_visitor::fix_math_operand(const src_reg &src)
318 {
319 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
320 return src;
321
322 /* The gen6 math instruction ignores the source modifiers --
323 * swizzle, abs, negate, and at least some parts of the register
324 * region description.
325 *
326 * Rather than trying to enumerate all these cases, *always* expand the
327 * operand to a temp GRF for gen6.
328 *
329 * For gen7, keep the operand as-is, except if immediate, which gen7 still
330 * can't use.
331 */
332
333 if (devinfo->gen == 7 && src.file != IMM)
334 return src;
335
336 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
337 expanded.type = src.type;
338 emit(MOV(expanded, src));
339 return src_reg(expanded);
340 }
341
342 vec4_instruction *
343 vec4_visitor::emit_math(enum opcode opcode,
344 const dst_reg &dst,
345 const src_reg &src0, const src_reg &src1)
346 {
347 vec4_instruction *math =
348 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
349
350 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
351 /* MATH on Gen6 must be align1, so we can't do writemasks. */
352 math->dst = dst_reg(this, glsl_type::vec4_type);
353 math->dst.type = dst.type;
354 math = emit(MOV(dst, src_reg(math->dst)));
355 } else if (devinfo->gen < 6) {
356 math->base_mrf = 1;
357 math->mlen = src1.file == BAD_FILE ? 1 : 2;
358 }
359
360 return math;
361 }
362
363 void
364 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
365 {
366 if (devinfo->gen < 7) {
367 unreachable("ir_unop_pack_half_2x16 should be lowered");
368 }
369
370 assert(dst.type == BRW_REGISTER_TYPE_UD);
371 assert(src0.type == BRW_REGISTER_TYPE_F);
372
373 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
374 *
375 * Because this instruction does not have a 16-bit floating-point type,
376 * the destination data type must be Word (W).
377 *
378 * The destination must be DWord-aligned and specify a horizontal stride
379 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
380 * each destination channel and the upper word is not modified.
381 *
382 * The above restriction implies that the f32to16 instruction must use
383 * align1 mode, because only in align1 mode is it possible to specify
384 * horizontal stride. We choose here to defy the hardware docs and emit
385 * align16 instructions.
386 *
387 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
388 * instructions. I was partially successful in that the code passed all
389 * tests. However, the code was dubiously correct and fragile, and the
390 * tests were not harsh enough to probe that frailty. Not trusting the
391 * code, I chose instead to remain in align16 mode in defiance of the hw
392 * docs).
393 *
394 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
395 * simulator, emitting a f32to16 in align16 mode with UD as destination
396 * data type is safe. The behavior differs from that specified in the PRM
397 * in that the upper word of each destination channel is cleared to 0.
398 */
399
400 dst_reg tmp_dst(this, glsl_type::uvec2_type);
401 src_reg tmp_src(tmp_dst);
402
403 #if 0
404 /* Verify the undocumented behavior on which the following instructions
405 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
406 * then the result of the bit-or instruction below will be incorrect.
407 *
408 * You should inspect the disasm output in order to verify that the MOV is
409 * not optimized away.
410 */
411 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
412 #endif
413
414 /* Give tmp the form below, where "." means untouched.
415 *
416 * w z y x w z y x
417 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
418 *
419 * That the upper word of each write-channel be 0 is required for the
420 * following bit-shift and bit-or instructions to work. Note that this
421 * relies on the undocumented hardware behavior mentioned above.
422 */
423 tmp_dst.writemask = WRITEMASK_XY;
424 emit(F32TO16(tmp_dst, src0));
425
426 /* Give the write-channels of dst the form:
427 * 0xhhhh0000
428 */
429 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
430 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
431
432 /* Finally, give the write-channels of dst the form of packHalf2x16's
433 * output:
434 * 0xhhhhllll
435 */
436 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
437 emit(OR(dst, src_reg(dst), tmp_src));
438 }
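/* A worked example of the sequence above, assuming the usual IEEE half-float
 * encoding: for src0 = (1.0f, -2.0f), half(1.0) = 0x3c00 and half(-2.0) =
 * 0xc000, so after the F32TO16 tmp looks like |.|.|0x0000c000|0x00003c00|.
 * The SHL of tmp.yyyy by 16 gives dst = 0xc0000000, and the final OR with
 * tmp.xxxx yields dst = 0xc0003c00, which is packHalf2x16(vec2(1.0, -2.0)).
 */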
439
440 void
441 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
442 {
443 if (devinfo->gen < 7) {
444 unreachable("ir_unop_unpack_half_2x16 should be lowered");
445 }
446
447 assert(dst.type == BRW_REGISTER_TYPE_F);
448 assert(src0.type == BRW_REGISTER_TYPE_UD);
449
450 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
451 *
452 * Because this instruction does not have a 16-bit floating-point type,
453 * the source data type must be Word (W). The destination type must be
454 * F (Float).
455 *
456 * To use W as the source data type, we must adjust horizontal strides,
457 * which is only possible in align1 mode. All my [chadv] attempts at
458 * emitting align1 instructions for unpackHalf2x16 failed to pass the
459 * Piglit tests, so I gave up.
460 *
461 * I've verified that, on gen7 hardware and the simulator, it is safe to
462 * emit f16to32 in align16 mode with UD as source data type.
463 */
464
465 dst_reg tmp_dst(this, glsl_type::uvec2_type);
466 src_reg tmp_src(tmp_dst);
467
468 tmp_dst.writemask = WRITEMASK_X;
469 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
470
471 tmp_dst.writemask = WRITEMASK_Y;
472 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
473
474 dst.writemask = WRITEMASK_XY;
475 emit(F16TO32(dst, tmp_src));
476 }
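/* This is the inverse of the packing example above: for src0 = 0xc0003c00,
 * the AND leaves tmp.x = 0x00003c00 and the SHR leaves tmp.y = 0x0000c000,
 * so the F16TO32 produces dst.xy = (1.0f, -2.0f), matching unpackHalf2x16().
 */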
477
478 void
479 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
480 {
481 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
482 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
483 * is not suitable to generate the shift values, but we can use the packed
484 * vector float and a type-converting MOV.
485 */
486 dst_reg shift(this, glsl_type::uvec4_type);
487 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
488
489 dst_reg shifted(this, glsl_type::uvec4_type);
490 src0.swizzle = BRW_SWIZZLE_XXXX;
491 emit(SHR(shifted, src0, src_reg(shift)));
492
493 shifted.type = BRW_REGISTER_TYPE_UB;
494 dst_reg f(this, glsl_type::vec4_type);
495 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
496
497 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
498 }
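/* The immediate above leans on the 8-bit "vector float" (VF) encoding, which
 * (assuming the usual 3-bit exponent with bias 3 and 4-bit mantissa) decodes
 * 0x00, 0x60, 0x70 and 0x78 to 0.0, 8.0, 16.0 and 24.0; the type-converting
 * MOV then yields the shift vector <0, 8, 16, 24>. As a worked example,
 * src0 = 0x80ff4000 shifts to <0x80ff4000, 0x0080ff40, 0x000080ff,
 * 0x00000080>, the low byte of each channel is <0x00, 0x40, 0xff, 0x80>, and
 * the MUL by 1/255 gives roughly (0.0, 0.251, 1.0, 0.502), i.e.
 * unpackUnorm4x8().
 */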
499
500 void
501 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
502 {
503 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
504 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
505 * is not suitable to generate the shift values, but we can use the packed
506 * vector float and a type-converting MOV.
507 */
508 dst_reg shift(this, glsl_type::uvec4_type);
509 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
510
511 dst_reg shifted(this, glsl_type::uvec4_type);
512 src0.swizzle = BRW_SWIZZLE_XXXX;
513 emit(SHR(shifted, src0, src_reg(shift)));
514
515 shifted.type = BRW_REGISTER_TYPE_B;
516 dst_reg f(this, glsl_type::vec4_type);
517 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
518
519 dst_reg scaled(this, glsl_type::vec4_type);
520 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
521
522 dst_reg max(this, glsl_type::vec4_type);
523 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
524 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
525 }
526
527 void
528 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
529 {
530 dst_reg saturated(this, glsl_type::vec4_type);
531 vec4_instruction *inst = emit(MOV(saturated, src0));
532 inst->saturate = true;
533
534 dst_reg scaled(this, glsl_type::vec4_type);
535 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
536
537 dst_reg rounded(this, glsl_type::vec4_type);
538 emit(RNDE(rounded, src_reg(scaled)));
539
540 dst_reg u(this, glsl_type::uvec4_type);
541 emit(MOV(u, src_reg(rounded)));
542
543 src_reg bytes(u);
544 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
545 }
546
547 void
548 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
549 {
550 dst_reg max(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
552
553 dst_reg min(this, glsl_type::vec4_type);
554 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
555
556 dst_reg scaled(this, glsl_type::vec4_type);
557 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
558
559 dst_reg rounded(this, glsl_type::vec4_type);
560 emit(RNDE(rounded, src_reg(scaled)));
561
562 dst_reg i(this, glsl_type::ivec4_type);
563 emit(MOV(i, src_reg(rounded)));
564
565 src_reg bytes(i);
566 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
567 }
568
569 /**
570 * Returns the minimum number of vec4 elements needed to pack a type.
571 *
572 * For simple types, it will return 1 (a single vec4); for matrices, the
573 * number of columns; for array and struct, the sum of the vec4_size of
574 * each of its elements; and for sampler and atomic, zero.
575 *
576 * This method is useful to calculate how much register space is needed to
577 * store a particular type.
578 */
579 extern "C" int
580 type_size_vec4(const struct glsl_type *type)
581 {
582 unsigned int i;
583 int size;
584
585 switch (type->base_type) {
586 case GLSL_TYPE_UINT:
587 case GLSL_TYPE_INT:
588 case GLSL_TYPE_FLOAT:
589 case GLSL_TYPE_BOOL:
590 if (type->is_matrix()) {
591 return type->matrix_columns;
592 } else {
593 /* Regardless of size of vector, it gets a vec4. This is bad
594 * packing for things like floats, but otherwise arrays become a
595 * mess. Hopefully a later pass over the code can pack scalars
596 * down if appropriate.
597 */
598 return 1;
599 }
600 case GLSL_TYPE_ARRAY:
601 assert(type->length > 0);
602 return type_size_vec4(type->fields.array) * type->length;
603 case GLSL_TYPE_STRUCT:
604 size = 0;
605 for (i = 0; i < type->length; i++) {
606 size += type_size_vec4(type->fields.structure[i].type);
607 }
608 return size;
609 case GLSL_TYPE_SUBROUTINE:
610 return 1;
611
612 case GLSL_TYPE_SAMPLER:
613 /* Samplers take up no register space, since they're baked in at
614 * link time.
615 */
616 return 0;
617 case GLSL_TYPE_ATOMIC_UINT:
618 return 0;
619 case GLSL_TYPE_IMAGE:
620 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
621 case GLSL_TYPE_VOID:
622 case GLSL_TYPE_DOUBLE:
623 case GLSL_TYPE_ERROR:
624 case GLSL_TYPE_INTERFACE:
625 case GLSL_TYPE_FUNCTION:
626 unreachable("not reached");
627 }
628
629 return 0;
630 }
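/* As a worked example, a uniform declared as
 *
 *    struct { mat4 m; float f[3]; vec2 uv; };
 *
 * takes 4 (matrix columns) + 3 (one vec4 per array element) + 1 = 8 vec4
 * slots, even though only part of some slots is actually used.
 */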
631
632 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
633 {
634 init();
635
636 this->file = VGRF;
637 this->nr = v->alloc.allocate(type_size_vec4(type));
638
639 if (type->is_array() || type->is_record()) {
640 this->swizzle = BRW_SWIZZLE_NOOP;
641 } else {
642 this->swizzle = brw_swizzle_for_size(type->vector_elements);
643 }
644
645 this->type = brw_type_for_base_type(type);
646 }
647
648 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
649 {
650 assert(size > 0);
651
652 init();
653
654 this->file = VGRF;
655 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
656
657 this->swizzle = BRW_SWIZZLE_NOOP;
658
659 this->type = brw_type_for_base_type(type);
660 }
661
662 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
663 {
664 init();
665
666 this->file = VGRF;
667 this->nr = v->alloc.allocate(type_size_vec4(type));
668
669 if (type->is_array() || type->is_record()) {
670 this->writemask = WRITEMASK_XYZW;
671 } else {
672 this->writemask = (1 << type->vector_elements) - 1;
673 }
674
675 this->type = brw_type_for_base_type(type);
676 }
677
678 vec4_instruction *
679 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
680 src_reg src0, src_reg src1)
681 {
682 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
683 inst->conditional_mod = conditionalmod;
684 return inst;
685 }
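/* SEL with a GE conditional picks the larger operand (a max) and SEL with L
 * picks the smaller (a min), so a clamp is simply the two chained, as
 * emit_pack_snorm_4x8() does above to clamp src0 to [-1.0, 1.0]:
 *
 *    emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
 *    emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
 */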
686
687 vec4_instruction *
688 vec4_visitor::emit_lrp(const dst_reg &dst,
689 const src_reg &x, const src_reg &y, const src_reg &a)
690 {
691 if (devinfo->gen >= 6) {
692 /* Note that the instruction's argument order is reversed from GLSL
693 * and the IR.
694 */
695 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
696 fix_3src_operand(x)));
697 } else {
698 /* Earlier generations don't support three source operations, so we
699 * need to emit x*(1-a) + y*a.
700 */
701 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
702 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
703 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
704 y_times_a.writemask = dst.writemask;
705 one_minus_a.writemask = dst.writemask;
706 x_times_one_minus_a.writemask = dst.writemask;
707
708 emit(MUL(y_times_a, y, a));
709 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
710 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
711 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
712 }
713 }
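/* Concretely, GLSL's mix(x, y, a) = x*(1-a) + y*a becomes LRP(dst, a, y, x)
 * on gen6+ because the three-source LRP computes src0*src1 + (1-src0)*src2,
 * while the pre-gen6 branch above expands the same expression as a
 * MUL/ADD/MUL/ADD sequence.
 */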
714
715 /**
716 * Emits the instructions needed to perform a pull constant load. before_block
717 * and before_inst can be NULL, in which case the instruction will be appended
718 * to the end of the instruction list.
719 */
720 void
721 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
722 src_reg surf_index,
723 src_reg offset_reg,
724 bblock_t *before_block,
725 vec4_instruction *before_inst)
726 {
727 assert((before_inst == NULL && before_block == NULL) ||
728 (before_inst && before_block));
729
730 vec4_instruction *pull;
731
732 if (devinfo->gen >= 9) {
733 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
734 src_reg header(this, glsl_type::uvec4_type, 2);
735
736 pull = new(mem_ctx)
737 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
738 dst_reg(header));
739
740 if (before_inst)
741 emit_before(before_block, before_inst, pull);
742 else
743 emit(pull);
744
745 dst_reg index_reg = retype(offset(dst_reg(header), 1),
746 offset_reg.type);
747 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
748
749 if (before_inst)
750 emit_before(before_block, before_inst, pull);
751 else
752 emit(pull);
753
754 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
755 dst,
756 surf_index,
757 header);
758 pull->mlen = 2;
759 pull->header_size = 1;
760 } else if (devinfo->gen >= 7) {
761 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
762
763 grf_offset.type = offset_reg.type;
764
765 pull = MOV(grf_offset, offset_reg);
766
767 if (before_inst)
768 emit_before(before_block, before_inst, pull);
769 else
770 emit(pull);
771
772 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
773 dst,
774 surf_index,
775 src_reg(grf_offset));
776 pull->mlen = 1;
777 } else {
778 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
779 dst,
780 surf_index,
781 offset_reg);
782 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
783 pull->mlen = 1;
784 }
785
786 if (before_inst)
787 emit_before(before_block, before_inst, pull);
788 else
789 emit(pull);
790 }
791
792 src_reg
793 vec4_visitor::emit_uniformize(const src_reg &src)
794 {
795 const src_reg chan_index(this, glsl_type::uint_type);
796 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
797 src.type);
798
799 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
800 ->force_writemask_all = true;
801 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
802 ->force_writemask_all = true;
803
804 return src_reg(dst);
805 }
806
807 src_reg
808 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
809 src_reg coordinate, src_reg surface)
810 {
811 vec4_instruction *inst =
812 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
813 dst_reg(this, glsl_type::uvec4_type));
814 inst->base_mrf = 2;
815 inst->src[1] = surface;
816 inst->src[2] = surface;
817
818 int param_base;
819
820 if (devinfo->gen >= 9) {
821 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
822 vec4_instruction *header_inst = new(mem_ctx)
823 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
824 dst_reg(MRF, inst->base_mrf));
825
826 emit(header_inst);
827
828 inst->mlen = 2;
829 inst->header_size = 1;
830 param_base = inst->base_mrf + 1;
831 } else {
832 inst->mlen = 1;
833 param_base = inst->base_mrf;
834 }
835
836 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
837 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
838 int zero_mask = 0xf & ~coord_mask;
839
840 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
841 coordinate));
842
843 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
844 brw_imm_d(0)));
845
846 emit(inst);
847 return src_reg(inst->dst);
848 }
849
850 bool
851 vec4_visitor::is_high_sampler(src_reg sampler)
852 {
853 if (devinfo->gen < 8 && !devinfo->is_haswell)
854 return false;
855
856 return sampler.file != IMM || sampler.ud >= 16;
857 }
858
859 void
860 vec4_visitor::emit_texture(ir_texture_opcode op,
861 dst_reg dest,
862 const glsl_type *dest_type,
863 src_reg coordinate,
864 int coord_components,
865 src_reg shadow_comparitor,
866 src_reg lod, src_reg lod2,
867 src_reg sample_index,
868 uint32_t constant_offset,
869 src_reg offset_value,
870 src_reg mcs,
871 bool is_cube_array,
872 uint32_t surface,
873 src_reg surface_reg,
874 uint32_t sampler,
875 src_reg sampler_reg)
876 {
877 /* The sampler can only meaningfully compute LOD for fragment shader
878 * messages. For all other stages, we change the opcode to TXL and hardcode
879 * the LOD to 0.
880 *
881 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
882 * valid LOD argument.
883 */
884 if (op == ir_tex || op == ir_query_levels) {
885 assert(lod.file == BAD_FILE);
886 lod = brw_imm_f(0.0f);
887 }
888
889 enum opcode opcode;
890 switch (op) {
891 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
892 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
893 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
894 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
895 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
896 SHADER_OPCODE_TXF_CMS); break;
897 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
898 case ir_tg4: opcode = offset_value.file != BAD_FILE
899 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
900 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
901 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
902 case ir_txb:
903 unreachable("TXB is not valid for vertex shaders.");
904 case ir_lod:
905 unreachable("LOD is not valid for vertex shaders.");
906 case ir_samples_identical: {
907 /* There are some challenges implementing this for vec4, and it seems
908 * unlikely to be used anyway. For now, just always return false.
909 */
910 emit(MOV(dest, brw_imm_ud(0u)));
911 return;
912 }
913 default:
914 unreachable("Unrecognized tex op");
915 }
916
917 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
918
919 inst->offset = constant_offset;
920
921 /* The message header is necessary for:
922 * - Gen4 (always)
923 * - Gen9+ for selecting SIMD4x2
924 * - Texel offsets
925 * - Gather channel selection
926 * - Sampler indices too large to fit in a 4-bit value.
927 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
928 */
929 inst->header_size =
930 (devinfo->gen < 5 || devinfo->gen >= 9 ||
931 inst->offset != 0 || op == ir_tg4 ||
932 op == ir_texture_samples ||
933 is_high_sampler(sampler_reg)) ? 1 : 0;
934 inst->base_mrf = 2;
935 inst->mlen = inst->header_size;
936 inst->dst.writemask = WRITEMASK_XYZW;
937 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
938
939 inst->src[1] = surface_reg;
940 inst->src[2] = sampler_reg;
941
942 /* MRF for the first parameter */
943 int param_base = inst->base_mrf + inst->header_size;
944
945 if (op == ir_txs || op == ir_query_levels) {
946 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
947 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
948 inst->mlen++;
949 } else if (op == ir_texture_samples) {
950 inst->dst.writemask = WRITEMASK_X;
951 } else {
952 /* Load the coordinate */
953 /* FINISHME: gl_clamp_mask and saturate */
954 int coord_mask = (1 << coord_components) - 1;
955 int zero_mask = 0xf & ~coord_mask;
956
957 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
958 coordinate));
959 inst->mlen++;
960
961 if (zero_mask != 0) {
962 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
963 brw_imm_d(0)));
964 }
965 /* Load the shadow comparitor */
966 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
967 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
968 WRITEMASK_X),
969 shadow_comparitor));
970 inst->mlen++;
971 }
972
973 /* Load the LOD info */
974 if (op == ir_tex || op == ir_txl) {
975 int mrf, writemask;
976 if (devinfo->gen >= 5) {
977 mrf = param_base + 1;
978 if (shadow_comparitor.file != BAD_FILE) {
979 writemask = WRITEMASK_Y;
980 /* mlen already incremented */
981 } else {
982 writemask = WRITEMASK_X;
983 inst->mlen++;
984 }
985 } else /* devinfo->gen == 4 */ {
986 mrf = param_base;
987 writemask = WRITEMASK_W;
988 }
989 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
990 } else if (op == ir_txf) {
991 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
992 } else if (op == ir_txf_ms) {
993 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
994 sample_index));
995 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
996 * MCS data is stored in the first two channels of `mcs`, but we
997 * need to get it into the .y and .z channels of the second vec4
998 * of params.
999 */
1000 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1001 emit(MOV(dst_reg(MRF, param_base + 1,
1002 glsl_type::uint_type, WRITEMASK_YZ),
1003 mcs));
1004 } else if (devinfo->gen >= 7) {
1005 /* MCS data is in the first channel of `mcs`, but we need to get it into
1006 * the .y channel of the second vec4 of params, so replicate .x across
1007 * the whole vec4 and then mask off everything except .y
1008 */
1009 mcs.swizzle = BRW_SWIZZLE_XXXX;
1010 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1011 mcs));
1012 }
1013 inst->mlen++;
1014 } else if (op == ir_txd) {
1015 const brw_reg_type type = lod.type;
1016
1017 if (devinfo->gen >= 5) {
1018 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1019 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1020 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1021 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1022 inst->mlen++;
1023
1024 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1025 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1026 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1027 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1028 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1029 inst->mlen++;
1030
1031 if (shadow_comparitor.file != BAD_FILE) {
1032 emit(MOV(dst_reg(MRF, param_base + 2,
1033 shadow_comparitor.type, WRITEMASK_Z),
1034 shadow_comparitor));
1035 }
1036 }
1037 } else /* devinfo->gen == 4 */ {
1038 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1039 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1040 inst->mlen += 2;
1041 }
1042 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1043 if (shadow_comparitor.file != BAD_FILE) {
1044 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1045 shadow_comparitor));
1046 }
1047
1048 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1049 offset_value));
1050 inst->mlen++;
1051 }
1052 }
1053
1054 emit(inst);
1055
1056 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1057 * spec requires layers.
1058 */
1059 if (op == ir_txs && is_cube_array) {
1060 emit_math(SHADER_OPCODE_INT_QUOTIENT,
1061 writemask(inst->dst, WRITEMASK_Z),
1062 src_reg(inst->dst), brw_imm_d(6));
1063 }
1064
1065 if (devinfo->gen == 6 && op == ir_tg4) {
1066 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1067 }
1068
1069 if (op == ir_query_levels) {
1070 /* # levels is in .w */
1071 src_reg swizzled(dest);
1072 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1073 SWIZZLE_W, SWIZZLE_W);
1074 emit(MOV(dest, swizzled));
1075 }
1076 }
1077
1078 /**
1079 * Apply workarounds for Gen6 gather with UINT/SINT
1080 */
1081 void
1082 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1083 {
1084 if (!wa)
1085 return;
1086
1087 int width = (wa & WA_8BIT) ? 8 : 16;
1088 dst_reg dst_f = dst;
1089 dst_f.type = BRW_REGISTER_TYPE_F;
1090
1091 /* Convert from UNORM to UINT */
1092 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1093 emit(MOV(dst, src_reg(dst_f)));
1094
1095 if (wa & WA_SIGN) {
1096 /* Reinterpret the UINT value as a signed INT value by
1097 * shifting the sign bit into place, then shifting back
1098 * preserving sign.
1099 */
1100 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1101 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1102 }
1103 }
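/* A worked example with wa = WA_8BIT | WA_SIGN: an 8-bit texel whose raw
 * value is 0xff comes back from the sampler as UNORM 1.0. The MUL by 255 and
 * the MOV turn that into the integer 255, and the SHL/ASR pair by 24
 * sign-extends bit 7, producing -1, which is what an SINT fetch should have
 * returned in the first place.
 */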
1104
1105 void
1106 vec4_visitor::gs_emit_vertex(int stream_id)
1107 {
1108 unreachable("not reached");
1109 }
1110
1111 void
1112 vec4_visitor::gs_end_primitive()
1113 {
1114 unreachable("not reached");
1115 }
1116
1117 void
1118 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1119 dst_reg dst, src_reg surf_offset,
1120 src_reg src0, src_reg src1)
1121 {
1122 unsigned mlen = 1 + (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
1123 src_reg src_payload(this, glsl_type::uint_type, mlen);
1124 dst_reg payload(src_payload);
1125 payload.writemask = WRITEMASK_X;
1126
1127 /* Set the atomic operation offset. */
1128 emit(MOV(offset(payload, 0), surf_offset));
1129 unsigned i = 1;
1130
1131 /* Set the atomic operation arguments. */
1132 if (src0.file != BAD_FILE) {
1133 emit(MOV(offset(payload, i), src0));
1134 i++;
1135 }
1136
1137 if (src1.file != BAD_FILE) {
1138 emit(MOV(offset(payload, i), src1));
1139 i++;
1140 }
1141
1142 /* Emit the instruction. Note that this maps to the normal SIMD8
1143 * untyped atomic message on Ivy Bridge, but that's OK because
1144 * unused channels will be masked out.
1145 */
1146 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1147 src_payload,
1148 brw_imm_ud(surf_index), brw_imm_ud(atomic_op));
1149 inst->mlen = mlen;
1150 }
1151
1152 void
1153 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1154 src_reg surf_offset)
1155 {
1156 dst_reg offset(this, glsl_type::uint_type);
1157 offset.writemask = WRITEMASK_X;
1158
1159 /* Set the surface read offset. */
1160 emit(MOV(offset, surf_offset));
1161
1162 /* Emit the instruction. Note that this maps to the normal SIMD8
1163 * untyped surface read message, but that's OK because unused
1164 * channels will be masked out.
1165 */
1166 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1167 src_reg(offset),
1168 brw_imm_ud(surf_index), brw_imm_d(1));
1169 inst->mlen = 1;
1170 }
1171
1172 void
1173 vec4_visitor::emit_ndc_computation()
1174 {
1175 if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1176 return;
1177
1178 /* Get the position */
1179 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1180
1181 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1182 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1183 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1184
1185 current_annotation = "NDC";
1186 dst_reg ndc_w = ndc;
1187 ndc_w.writemask = WRITEMASK_W;
1188 src_reg pos_w = pos;
1189 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1190 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1191
1192 dst_reg ndc_xyz = ndc;
1193 ndc_xyz.writemask = WRITEMASK_XYZ;
1194
1195 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1196 }
1197
1198 void
1199 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1200 {
1201 if (devinfo->gen < 6 &&
1202 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1203 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1204 devinfo->has_negative_rhw_bug)) {
1205 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1206 dst_reg header1_w = header1;
1207 header1_w.writemask = WRITEMASK_W;
1208
1209 emit(MOV(header1, brw_imm_ud(0u)));
1210
1211 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1212 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1213
1214 current_annotation = "Point size";
1215 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1216 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1217 }
1218
1219 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1220 current_annotation = "Clipping flags";
1221 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1222 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1223
1224 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1225 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1226 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1227
1228 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1229 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1230 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1231 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1232 }
1233
1234 /* i965 clipping workaround:
1235 * 1) Test for -ve rhw
1236 * 2) If set,
1237 * set ndc = (0,0,0,0)
1238 * set ucp[6] = 1
1239 *
1240 * Later, clipping will detect ucp[6] and ensure the primitive is
1241 * clipped against all fixed planes.
1242 */
1243 if (devinfo->has_negative_rhw_bug &&
1244 output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1245 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1246 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1247 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1248 vec4_instruction *inst;
1249 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1250 inst->predicate = BRW_PREDICATE_NORMAL;
1251 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1252 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], brw_imm_f(0.0f)));
1253 inst->predicate = BRW_PREDICATE_NORMAL;
1254 }
1255
1256 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1257 } else if (devinfo->gen < 6) {
1258 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1259 } else {
1260 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1261 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1262 dst_reg reg_w = reg;
1263 reg_w.writemask = WRITEMASK_W;
1264 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1265 reg_as_src.type = reg_w.type;
1266 reg_as_src.swizzle = brw_swizzle_for_size(1);
1267 emit(MOV(reg_w, reg_as_src));
1268 }
1269 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1270 dst_reg reg_y = reg;
1271 reg_y.writemask = WRITEMASK_Y;
1272 reg_y.type = BRW_REGISTER_TYPE_D;
1273 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1274 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1275 }
1276 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1277 dst_reg reg_z = reg;
1278 reg_z.writemask = WRITEMASK_Z;
1279 reg_z.type = BRW_REGISTER_TYPE_D;
1280 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1281 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1282 }
1283 }
1284 }
1285
1286 vec4_instruction *
1287 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1288 {
1289 assert(varying < VARYING_SLOT_MAX);
1290 assert(output_reg[varying].type == reg.type);
1291 current_annotation = output_reg_annotation[varying];
1292 if (output_reg[varying].file != BAD_FILE)
1293 return emit(MOV(reg, src_reg(output_reg[varying])));
1294 else
1295 return NULL;
1296 }
1297
1298 void
1299 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1300 {
1301 reg.type = BRW_REGISTER_TYPE_F;
1302 output_reg[varying].type = reg.type;
1303
1304 switch (varying) {
1305 case VARYING_SLOT_PSIZ:
1306 {
1307 /* PSIZ is always in slot 0, and is coupled with other flags. */
1308 current_annotation = "indices, point width, clip flags";
1309 emit_psiz_and_flags(reg);
1310 break;
1311 }
1312 case BRW_VARYING_SLOT_NDC:
1313 current_annotation = "NDC";
1314 if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1315 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1316 break;
1317 case VARYING_SLOT_POS:
1318 current_annotation = "gl_Position";
1319 if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1320 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1321 break;
1322 case VARYING_SLOT_EDGE:
1323 /* This is present when doing unfilled polygons. We're supposed to copy
1324 * the edge flag from the user-provided vertex array
1325 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1326 * of that attribute (starts as 1.0f). This is then used in clipping to
1327 * determine which edges should be drawn as wireframe.
1328 */
1329 current_annotation = "edge flag";
1330 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1331 glsl_type::float_type, WRITEMASK_XYZW))));
1332 break;
1333 case BRW_VARYING_SLOT_PAD:
1334 /* No need to write to this slot */
1335 break;
1336 default:
1337 emit_generic_urb_slot(reg, varying);
1338 break;
1339 }
1340 }
1341
1342 static int
1343 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1344 {
1345 if (devinfo->gen >= 6) {
1346 /* URB data written (does not include the message header reg) must
1347 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1348 * section 5.4.3.2.2: URB_INTERLEAVED.
1349 *
1350 * URB entries are allocated on a multiple of 1024 bits, so an
1351 * extra 128 bits written here to make the end align to 256 is
1352 * no problem.
1353 */
1354 if ((mlen % 2) != 1)
1355 mlen++;
1356 }
1357
1358 return mlen;
1359 }
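/* Since mlen includes the single message header register, "mlen % 2 != 1"
 * means the payload (mlen - 1 registers) is odd. For example, mlen = 4
 * (header plus 3 data registers) is padded to 5 so that two full 256-bit
 * rows are written, while mlen = 5 is already aligned and left alone.
 */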
1360
1361
1362 /**
1363 * Generates the VUE payload plus the necessary URB write instructions to
1364 * output it.
1365 *
1366 * The VUE layout is documented in Volume 2a.
1367 */
1368 void
1369 vec4_visitor::emit_vertex()
1370 {
1371 /* MRF 0 is reserved for the debugger, so start with message header
1372 * in MRF 1.
1373 */
1374 int base_mrf = 1;
1375 int mrf = base_mrf;
1376 /* In the process of generating our URB write message contents, we
1377 * may need to unspill a register or load from an array. Those
1378 * reads would use MRFs 14-15.
1379 */
1380 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1381
1382 /* The following assertion verifies that max_usable_mrf causes an
1383 * even-numbered amount of URB write data, which will meet gen6's
1384 * requirements for length alignment.
1385 */
1386 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1387
1388 /* First mrf is the g0-based message header containing URB handles and
1389 * such.
1390 */
1391 emit_urb_write_header(mrf++);
1392
1393 if (devinfo->gen < 6) {
1394 emit_ndc_computation();
1395 }
1396
1397 /* We may need to split this up into several URB writes, so do them in a
1398 * loop.
1399 */
1400 int slot = 0;
1401 bool complete = false;
1402 do {
1403 /* URB offset is in URB row increments, and each of our MRFs is half of
1404 * one of those, since we're doing interleaved writes.
1405 */
1406 int offset = slot / 2;
1407
1408 mrf = base_mrf + 1;
1409 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1410 emit_urb_slot(dst_reg(MRF, mrf++),
1411 prog_data->vue_map.slot_to_varying[slot]);
1412
1413 /* If this was max_usable_mrf, we can't fit anything more into this
1414 * URB WRITE. Same thing if we reached the maximum length available.
1415 */
1416 if (mrf > max_usable_mrf ||
1417 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1418 slot++;
1419 break;
1420 }
1421 }
1422
1423 complete = slot >= prog_data->vue_map.num_slots;
1424 current_annotation = "URB write";
1425 vec4_instruction *inst = emit_urb_write_opcode(complete);
1426 inst->base_mrf = base_mrf;
1427 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1428 inst->offset += offset;
1429 } while(!complete);
1430 }
1431
1432
1433 src_reg
1434 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1435 src_reg *reladdr, int reg_offset)
1436 {
1437 /* Because we store the values to scratch interleaved like our
1438 * vertex data, we need to scale the vec4 index by 2.
1439 */
1440 int message_header_scale = 2;
1441
1442 /* Pre-gen6, the message header uses byte offsets instead of vec4
1443 * (16-byte) offset units.
1444 */
1445 if (devinfo->gen < 6)
1446 message_header_scale *= 16;
1447
1448 if (reladdr) {
1449 src_reg index = src_reg(this, glsl_type::int_type);
1450
1451 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1452 brw_imm_d(reg_offset)));
1453 emit_before(block, inst, MUL(dst_reg(index), index,
1454 brw_imm_d(message_header_scale)));
1455
1456 return index;
1457 } else {
1458 return brw_imm_d(reg_offset * message_header_scale);
1459 }
1460 }
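/* For example, reg_offset = 3 with no reladdr becomes an immediate of 6 on
 * gen6+ (interleaved vec4 units) and 96 on gen4/5 (3 * 2 * 16 bytes).
 */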
1461
1462 /**
1463 * Emits an instruction before @inst to load the value named by @orig_src
1464 * from scratch space at @base_offset to @temp.
1465 *
1466 * @base_offset is measured in 32-byte units (the size of a register).
1467 */
1468 void
1469 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1470 dst_reg temp, src_reg orig_src,
1471 int base_offset)
1472 {
1473 int reg_offset = base_offset + orig_src.reg_offset;
1474 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1475 reg_offset);
1476
1477 emit_before(block, inst, SCRATCH_READ(temp, index));
1478 }
1479
1480 /**
1481 * Emits an instruction after @inst to store the value to be written
1482 * to @orig_dst to scratch space at @base_offset, from @temp.
1483 *
1484 * @base_offset is measured in 32-byte units (the size of a register).
1485 */
1486 void
1487 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1488 int base_offset)
1489 {
1490 int reg_offset = base_offset + inst->dst.reg_offset;
1491 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1492 reg_offset);
1493
1494 /* Create a temporary register to store *inst's result in.
1495 *
1496 * We have to be careful in MOVing from our temporary result register in
1497 * the scratch write. If we swizzle from channels of the temporary that
1498 * weren't initialized, it will confuse live interval analysis, which will
1499 * make spilling fail to make progress.
1500 */
1501 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1502 inst->dst.type),
1503 brw_swizzle_for_mask(inst->dst.writemask));
1504 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1505 inst->dst.writemask));
1506 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1507 if (inst->opcode != BRW_OPCODE_SEL)
1508 write->predicate = inst->predicate;
1509 write->ir = inst->ir;
1510 write->annotation = inst->annotation;
1511 inst->insert_after(block, write);
1512
1513 inst->dst.file = temp.file;
1514 inst->dst.nr = temp.nr;
1515 inst->dst.reg_offset = temp.reg_offset;
1516 inst->dst.reladdr = NULL;
1517 }
1518
1519 /**
1520 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1521 * adds the scratch read(s) before \p inst. The function also checks for
1522 * recursive reladdr scratch accesses, issuing the corresponding scratch
1523 * loads and rewriting reladdr references accordingly.
1524 *
1525 * \return \p src if it did not require a scratch load, otherwise, the
1526 * register holding the result of the scratch load that the caller should
1527 * use to rewrite src.
1528 */
1529 src_reg
1530 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1531 vec4_instruction *inst, src_reg src)
1532 {
1533 /* Resolve recursive reladdr scratch access by calling ourselves
1534 * with src.reladdr
1535 */
1536 if (src.reladdr)
1537 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1538 *src.reladdr);
1539
1540 /* Now handle scratch access on src */
1541 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1542 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1543 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1544 src.nr = temp.nr;
1545 src.reg_offset = temp.reg_offset;
1546 src.reladdr = NULL;
1547 }
1548
1549 return src;
1550 }
1551
1552 /**
1553 * We can't generally support array access in GRF space, because a
1554 * single instruction's destination can only span 2 contiguous
1555 * registers. So, we send all GRF arrays that get variable index
1556 * access to scratch space.
1557 */
1558 void
1559 vec4_visitor::move_grf_array_access_to_scratch()
1560 {
1561 int scratch_loc[this->alloc.count];
1562 memset(scratch_loc, -1, sizeof(scratch_loc));
1563
1564 /* First, calculate the set of virtual GRFs that need to be punted
1565 * to scratch due to having any array access on them, and where in
1566 * scratch.
1567 */
1568 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1569 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1570 if (scratch_loc[inst->dst.nr] == -1) {
1571 scratch_loc[inst->dst.nr] = last_scratch;
1572 last_scratch += this->alloc.sizes[inst->dst.nr];
1573 }
1574
1575 for (src_reg *iter = inst->dst.reladdr;
1576 iter->reladdr;
1577 iter = iter->reladdr) {
1578 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1579 scratch_loc[iter->nr] = last_scratch;
1580 last_scratch += this->alloc.sizes[iter->nr];
1581 }
1582 }
1583 }
1584
1585 for (int i = 0 ; i < 3; i++) {
1586 for (src_reg *iter = &inst->src[i];
1587 iter->reladdr;
1588 iter = iter->reladdr) {
1589 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1590 scratch_loc[iter->nr] = last_scratch;
1591 last_scratch += this->alloc.sizes[iter->nr];
1592 }
1593 }
1594 }
1595 }
1596
1597 /* Now, for anything that will be accessed through scratch, rewrite
1598 * it to load/store. Note that this is a _safe list walk, because
1599 * we may generate a new scratch_write instruction after the one
1600 * we're processing.
1601 */
1602 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1603 /* Set up the annotation tracking for new generated instructions. */
1604 base_ir = inst->ir;
1605 current_annotation = inst->annotation;
1606
1607 /* First handle scratch access on the dst. Notice we have to handle
1608 * the case where the dst's reladdr also points to scratch space.
1609 */
1610 if (inst->dst.reladdr)
1611 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1612 *inst->dst.reladdr);
1613
1614 /* Now that we have handled any (possibly recursive) reladdr scratch
1615 * accesses for dst we can safely do the scratch write for dst itself
1616 */
1617 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1618 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1619
1620 /* Now handle scratch access on any src. In this case, since inst->src[i]
1621 * already is a src_reg, we can just call emit_resolve_reladdr with
1622 * inst->src[i] and it will take care of handling scratch loads for
1623 * both src and src.reladdr (recursively).
1624 */
1625 for (int i = 0 ; i < 3; i++) {
1626 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1627 inst->src[i]);
1628 }
1629 }
1630 }
1631
1632 /**
1633 * Emits an instruction before @inst to load the value named by @orig_src
1634 * from the pull constant buffer (surface) at @base_offset to @temp.
1635 */
1636 void
1637 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1638 dst_reg temp, src_reg orig_src,
1639 int base_offset, src_reg indirect)
1640 {
1641 int reg_offset = base_offset + orig_src.reg_offset;
1642 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1643
1644 src_reg offset;
1645 if (indirect.file != BAD_FILE) {
1646 offset = src_reg(this, glsl_type::int_type);
1647
1648 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1649 brw_imm_d(reg_offset * 16)));
1650 } else if (devinfo->gen >= 8) {
1651 /* Store the offset in a GRF so we can send-from-GRF. */
1652 offset = src_reg(this, glsl_type::int_type);
1653 emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset * 16)));
1654 } else {
1655 offset = brw_imm_d(reg_offset * 16);
1656 }
1657
1658 emit_pull_constant_load_reg(temp,
1659 brw_imm_ud(index),
1660 offset,
1661 block, inst);
1662
1663 brw_mark_surface_used(&prog_data->base, index);
1664 }
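/* reg_offset counts vec4 constants, and the * 16 above converts that to a
 * byte offset since each vec4 occupies 16 bytes. For example, base_offset = 2
 * with orig_src.reg_offset = 1 gives reg_offset = 3 and a byte offset of 48.
 */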
1665
1666 /**
1667 * Implements array access of uniforms by inserting a
1668 * PULL_CONSTANT_LOAD instruction.
1669 *
1670 * Unlike temporary GRF array access (where we don't support it due to
1671 * the difficulty of doing relative addressing on instruction
1672 * destinations), we could potentially do array access of uniforms
1673 * that were loaded in GRF space as push constants. In real-world
1674 * usage we've seen, though, the arrays being used are always larger
1675 * than we could load as push constants, so just always move all
1676 * uniform array access out to a pull constant buffer.
1677 */
1678 void
1679 vec4_visitor::move_uniform_array_access_to_pull_constants()
1680 {
1681 int pull_constant_loc[this->uniforms];
1682 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1683
1684 /* First, walk through the instructions and determine which things need to
1685 * be pulled. We mark something as needing to be pulled by setting
1686 * pull_constant_loc to 0.
1687 */
1688 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1689 /* We only care about MOV_INDIRECT of a uniform */
1690 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1691 inst->src[0].file != UNIFORM)
1692 continue;
1693
1694 int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
1695
1696 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1697 pull_constant_loc[uniform_nr + j] = 0;
1698 }
1699
1700 /* Next, we walk the list of uniforms and assign real pull constant
1701 * locations and set their corresponding entries in pull_param.
1702 */
1703 for (int j = 0; j < this->uniforms; j++) {
1704 if (pull_constant_loc[j] < 0)
1705 continue;
1706
1707 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1708
1709 for (int i = 0; i < 4; i++) {
1710 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1711 = stage_prog_data->param[j * 4 + i];
1712 }
1713 }
1714
1715 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1716 * instructions to actual uniform pulls.
1717 */
1718 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1719 /* We only care about MOV_INDIRECT of a uniform */
1720 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1721 inst->src[0].file != UNIFORM)
1722 continue;
1723
1724 int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
1725
1726 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1727
1728 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1729 pull_constant_loc[uniform_nr], inst->src[1]);
1730 inst->remove(block);
1731 }
1732
1733 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1734 * no need to track them as larger-than-vec4 objects. This will be
1735 * relied on in cutting out unused uniform vectors from push
1736 * constants.
1737 */
1738 split_uniform_registers();
1739 }
1740
1741 void
1742 vec4_visitor::resolve_ud_negate(src_reg *reg)
1743 {
1744 if (reg->type != BRW_REGISTER_TYPE_UD ||
1745 !reg->negate)
1746 return;
1747
1748 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1749 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1750 *reg = temp;
1751 }
1752
1753 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1754 void *log_data,
1755 const struct brw_sampler_prog_key_data *key_tex,
1756 struct brw_vue_prog_data *prog_data,
1757 const nir_shader *shader,
1758 void *mem_ctx,
1759 bool no_spills,
1760 int shader_time_index)
1761 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1762 key_tex(key_tex),
1763 prog_data(prog_data),
1764 fail_msg(NULL),
1765 first_non_payload_grf(0),
1766 need_all_constants_in_pull_buffer(false),
1767 no_spills(no_spills),
1768 shader_time_index(shader_time_index),
1769 last_scratch(0)
1770 {
1771 this->failed = false;
1772
1773 this->base_ir = NULL;
1774 this->current_annotation = NULL;
1775 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1776
1777 this->virtual_grf_start = NULL;
1778 this->virtual_grf_end = NULL;
1779 this->live_intervals = NULL;
1780
1781 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1782
1783 this->uniforms = 0;
1784 }
1785
1786 vec4_visitor::~vec4_visitor()
1787 {
1788 }
1789
1790
1791 void
1792 vec4_visitor::fail(const char *format, ...)
1793 {
1794 va_list va;
1795 char *msg;
1796
1797 if (failed)
1798 return;
1799
1800 failed = true;
1801
1802 va_start(va, format);
1803 msg = ralloc_vasprintf(mem_ctx, format, va);
1804 va_end(va);
1805 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1806
1807 this->fail_msg = msg;
1808
1809 if (debug_enabled) {
1810 fprintf(stderr, "%s", msg);
1811 }
1812 }
1813
1814 } /* namespace brw */