glsl/types: rename is_dual_slot_double to is_dual_slot_64bit.
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "brw_program.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240
241 resolve_ud_negate(&src0);
242 resolve_ud_negate(&src1);
243
244 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
245 inst->conditional_mod = condition;
246
247 return inst;
248 }
249
250 vec4_instruction *
251 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
252 {
253 vec4_instruction *inst;
254
255 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
256 dst, index);
257 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
258 inst->mlen = 2;
259
260 return inst;
261 }
262
263 vec4_instruction *
264 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
265 const src_reg &index)
266 {
267 vec4_instruction *inst;
268
269 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
270 dst, src, index);
271 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
272 inst->mlen = 3;
273
274 return inst;
275 }
276
277 src_reg
278 vec4_visitor::fix_3src_operand(const src_reg &src)
279 {
280 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
281 * able to use vertical stride of zero to replicate the vec4 uniform, like
282 *
283 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
284 *
285 * But you can't, since vertical stride is always four in three-source
286 * instructions. Instead, insert a MOV instruction to do the replication so
287 * that the three-source instruction can consume it.
288 */
289
290 /* The MOV is only needed if the source is a uniform or immediate. */
291 if (src.file != UNIFORM && src.file != IMM)
292 return src;
293
294 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
295 return src;
296
297 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
298 expanded.type = src.type;
299 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
300 return src_reg(expanded);
301 }
302
303 src_reg
304 vec4_visitor::resolve_source_modifiers(const src_reg &src)
305 {
306 if (!src.abs && !src.negate)
307 return src;
308
309 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
310 resolved.type = src.type;
311 emit(MOV(resolved, src));
312
313 return src_reg(resolved);
314 }
315
316 src_reg
317 vec4_visitor::fix_math_operand(const src_reg &src)
318 {
319 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
320 return src;
321
322 /* The gen6 math instruction ignores the source modifiers --
323 * swizzle, abs, negate, and at least some parts of the register
324 * region description.
325 *
326 * Rather than trying to enumerate all these cases, *always* expand the
327 * operand to a temp GRF for gen6.
328 *
329 * For gen7, keep the operand as-is, except if immediate, which gen7 still
330 * can't use.
331 */
332
333 if (devinfo->gen == 7 && src.file != IMM)
334 return src;
335
336 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
337 expanded.type = src.type;
338 emit(MOV(expanded, src));
339 return src_reg(expanded);
340 }
341
342 vec4_instruction *
343 vec4_visitor::emit_math(enum opcode opcode,
344 const dst_reg &dst,
345 const src_reg &src0, const src_reg &src1)
346 {
347 vec4_instruction *math =
348 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
349
350 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
351 /* MATH on Gen6 must be align1, so we can't do writemasks. */
352 math->dst = dst_reg(this, glsl_type::vec4_type);
353 math->dst.type = dst.type;
354 math = emit(MOV(dst, src_reg(math->dst)));
355 } else if (devinfo->gen < 6) {
356 math->base_mrf = 1;
357 math->mlen = src1.file == BAD_FILE ? 1 : 2;
358 }
359
360 return math;
361 }
362
363 void
364 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
365 {
366 if (devinfo->gen < 7) {
367 unreachable("ir_unop_pack_half_2x16 should be lowered");
368 }
369
370 assert(dst.type == BRW_REGISTER_TYPE_UD);
371 assert(src0.type == BRW_REGISTER_TYPE_F);
372
373 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
374 *
375 * Because this instruction does not have a 16-bit floating-point type,
376 * the destination data type must be Word (W).
377 *
378 * The destination must be DWord-aligned and specify a horizontal stride
379 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
380 * each destination channel and the upper word is not modified.
381 *
382 * The above restriction implies that the f32to16 instruction must use
383 * align1 mode, because only in align1 mode is it possible to specify
384 * horizontal stride. We choose here to defy the hardware docs and emit
385 * align16 instructions.
386 *
387 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
388 * instructions. I was partially successful in that the code passed all
389 * tests. However, the code was dubiously correct and fragile, and the
390 * tests were not harsh enough to probe that frailty. Not trusting the
391 * code, I chose instead to remain in align16 mode in defiance of the hw
392 * docs).
393 *
394 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
395 * simulator, emitting a f32to16 in align16 mode with UD as destination
396 * data type is safe. The behavior differs from that specified in the PRM
397 * in that the upper word of each destination channel is cleared to 0.
398 */
399
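   /* Overview of the sequence below, as a reading aid: F32TO16 writes the
    * half-float results into the low words of tmp.x and tmp.y, SHL of tmp.y
    * by 16 places the high half in dst as 0xhhhh0000, and the final OR with
    * tmp.x fills in the low half to give 0xhhhhllll.
    */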
400 dst_reg tmp_dst(this, glsl_type::uvec2_type);
401 src_reg tmp_src(tmp_dst);
402
403 #if 0
404 /* Verify the undocumented behavior on which the following instructions
405 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
406 * then the result of the bit-or instruction below will be incorrect.
407 *
408 * You should inspect the disasm output in order to verify that the MOV is
409 * not optimized away.
410 */
411 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
412 #endif
413
414 /* Give tmp the form below, where "." means untouched.
415 *
416 * w z y x w z y x
417 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
418 *
419 * That the upper word of each write-channel be 0 is required for the
420 * following bit-shift and bit-or instructions to work. Note that this
421 * relies on the undocumented hardware behavior mentioned above.
422 */
423 tmp_dst.writemask = WRITEMASK_XY;
424 emit(F32TO16(tmp_dst, src0));
425
426 /* Give the write-channels of dst the form:
427 * 0xhhhh0000
428 */
429 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
430 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
431
432 /* Finally, give the write-channels of dst the form of packHalf2x16's
433 * output:
434 * 0xhhhhllll
435 */
436 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
437 emit(OR(dst, src_reg(dst), tmp_src));
438 }
439
440 void
441 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
442 {
443 if (devinfo->gen < 7) {
444 unreachable("ir_unop_unpack_half_2x16 should be lowered");
445 }
446
447 assert(dst.type == BRW_REGISTER_TYPE_F);
448 assert(src0.type == BRW_REGISTER_TYPE_UD);
449
450 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
451 *
452 * Because this instruction does not have a 16-bit floating-point type,
453 * the source data type must be Word (W). The destination type must be
454 * F (Float).
455 *
456 * To use W as the source data type, we must adjust horizontal strides,
457 * which is only possible in align1 mode. All my [chadv] attempts at
458 * emitting align1 instructions for unpackHalf2x16 failed to pass the
459 * Piglit tests, so I gave up.
460 *
461 * I've verified that, on gen7 hardware and the simulator, it is safe to
462 * emit f16to32 in align16 mode with UD as source data type.
463 */
464
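   /* Unpacking is the reverse of the packing above: AND extracts the low
    * half-float into tmp.x, SHR extracts the high half into tmp.y, and a
    * single F16TO32 converts both halves into the .x and .y channels of dst.
    */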
465 dst_reg tmp_dst(this, glsl_type::uvec2_type);
466 src_reg tmp_src(tmp_dst);
467
468 tmp_dst.writemask = WRITEMASK_X;
469 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
470
471 tmp_dst.writemask = WRITEMASK_Y;
472 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
473
474 dst.writemask = WRITEMASK_XY;
475 emit(F16TO32(dst, tmp_src));
476 }
477
478 void
479 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
480 {
481 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
482 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
483 * is not suitable to generate the shift values, but we can use the packed
484 * vector float and a type-converting MOV.
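      *
      * The vector-float immediate used below encodes the values (0.0, 8.0,
      * 16.0, 24.0), so MOVing it into the uvec4 `shift` register converts
      * those into the integer shift counts <0, 8, 16, 24>.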
485 */
486 dst_reg shift(this, glsl_type::uvec4_type);
487 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
488
489 dst_reg shifted(this, glsl_type::uvec4_type);
490 src0.swizzle = BRW_SWIZZLE_XXXX;
491 emit(SHR(shifted, src0, src_reg(shift)));
492
493 shifted.type = BRW_REGISTER_TYPE_UB;
494 dst_reg f(this, glsl_type::vec4_type);
495 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
496
497 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
498 }
499
500 void
501 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
502 {
503 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
504 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
505 * is not suitable to generate the shift values, but we can use the packed
506 * vector float and a type-converting MOV.
507 */
508 dst_reg shift(this, glsl_type::uvec4_type);
509 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
510
511 dst_reg shifted(this, glsl_type::uvec4_type);
512 src0.swizzle = BRW_SWIZZLE_XXXX;
513 emit(SHR(shifted, src0, src_reg(shift)));
514
515 shifted.type = BRW_REGISTER_TYPE_B;
516 dst_reg f(this, glsl_type::vec4_type);
517 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
518
519 dst_reg scaled(this, glsl_type::vec4_type);
520 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
521
522 dst_reg max(this, glsl_type::vec4_type);
523 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
524 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
525 }
526
527 void
528 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
529 {
530 dst_reg saturated(this, glsl_type::vec4_type);
531 vec4_instruction *inst = emit(MOV(saturated, src0));
532 inst->saturate = true;
533
534 dst_reg scaled(this, glsl_type::vec4_type);
535 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
536
537 dst_reg rounded(this, glsl_type::vec4_type);
538 emit(RNDE(rounded, src_reg(scaled)));
539
540 dst_reg u(this, glsl_type::uvec4_type);
541 emit(MOV(u, src_reg(rounded)));
542
543 src_reg bytes(u);
544 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
545 }
546
547 void
548 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
549 {
550 dst_reg max(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
552
553 dst_reg min(this, glsl_type::vec4_type);
554 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
555
556 dst_reg scaled(this, glsl_type::vec4_type);
557 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
558
559 dst_reg rounded(this, glsl_type::vec4_type);
560 emit(RNDE(rounded, src_reg(scaled)));
561
562 dst_reg i(this, glsl_type::ivec4_type);
563 emit(MOV(i, src_reg(rounded)));
564
565 src_reg bytes(i);
566 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
567 }
568
569 /*
570 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
571 * false) elements needed to pack a type.
572 */
573 static int
574 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
575 {
576 unsigned int i;
577 int size;
578
579 switch (type->base_type) {
580 case GLSL_TYPE_UINT:
581 case GLSL_TYPE_INT:
582 case GLSL_TYPE_FLOAT:
583 case GLSL_TYPE_BOOL:
584 case GLSL_TYPE_DOUBLE:
585 if (type->is_matrix()) {
586 const glsl_type *col_type = type->column_type();
587 unsigned col_slots =
588 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
589 return type->matrix_columns * col_slots;
590 } else {
591 /* Regardless of size of vector, it gets a vec4. This is bad
592 * packing for things like floats, but otherwise arrays become a
593 * mess. Hopefully a later pass over the code can pack scalars
594 * down if appropriate.
595 */
596 return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
597 }
598 case GLSL_TYPE_ARRAY:
599 assert(type->length > 0);
600 return type_size_xvec4(type->fields.array, as_vec4) * type->length;
601 case GLSL_TYPE_STRUCT:
602 size = 0;
603 for (i = 0; i < type->length; i++) {
604 size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
605 }
606 return size;
607 case GLSL_TYPE_SUBROUTINE:
608 return 1;
609
610 case GLSL_TYPE_SAMPLER:
611 /* Samplers take up no register space, since they're baked in at
612 * link time.
613 */
614 return 0;
615 case GLSL_TYPE_ATOMIC_UINT:
616 return 0;
617 case GLSL_TYPE_IMAGE:
618 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
619 case GLSL_TYPE_VOID:
620 case GLSL_TYPE_ERROR:
621 case GLSL_TYPE_INTERFACE:
622 case GLSL_TYPE_FUNCTION:
623 unreachable("not reached");
624 }
625
626 return 0;
627 }
628
629 /**
630 * Returns the minimum number of vec4 elements needed to pack a type.
631 *
632 * For simple types, it will return 1 (a single vec4); for matrices, the
 633 * number of columns; for arrays and structs, the sum of the vec4 sizes of
 634 * their elements; and for samplers and atomics, zero.
635 *
636 * This method is useful to calculate how much register space is needed to
637 * store a particular type.
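 *
 * For example, under this measure a float, a vec3 and a vec4 each take one
 * slot, a mat3 takes three (one per column), a float[4] array takes four,
 * and a dvec4 takes two.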
638 */
639 extern "C" int
640 type_size_vec4(const struct glsl_type *type)
641 {
642 return type_size_xvec4(type, true);
643 }
644
645 /**
646 * Returns the minimum number of dvec4 elements needed to pack a type.
647 *
648 * For simple types, it will return 1 (a single dvec4); for matrices, the
 649 * number of columns; for arrays and structs, the sum of the dvec4 sizes of
 650 * their elements; and for samplers and atomics, zero.
651 *
652 * This method is useful to calculate how much register space is needed to
653 * store a particular type.
654 *
655 * Measuring double-precision vertex inputs as dvec4 is required because
 656 * ARB_vertex_attrib_64bit states that they use the same number of locations
 657 * as the single-precision version. That is, two consecutive dvec4s would be
 658 * located in locations "x" and "x+1", not "x" and "x+2".
659 *
 660 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
 661 * remap_vs_attrs() takes into account both the location and whether the
 662 * type fits in one or two vec4 slots.
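 *
 * For example, a dvec4 counts as a single dvec4 slot here even though
 * type_size_vec4() above counts it as two vec4 slots, and a dmat3 counts as
 * three dvec4 slots (one per column) rather than six.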
663 */
664 extern "C" int
665 type_size_dvec4(const struct glsl_type *type)
666 {
667 return type_size_xvec4(type, false);
668 }
669
670 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
671 {
672 init();
673
674 this->file = VGRF;
675 this->nr = v->alloc.allocate(type_size_vec4(type));
676
677 if (type->is_array() || type->is_record()) {
678 this->swizzle = BRW_SWIZZLE_NOOP;
679 } else {
680 this->swizzle = brw_swizzle_for_size(type->vector_elements);
681 }
682
683 this->type = brw_type_for_base_type(type);
684 }
685
686 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
687 {
688 assert(size > 0);
689
690 init();
691
692 this->file = VGRF;
693 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
694
695 this->swizzle = BRW_SWIZZLE_NOOP;
696
697 this->type = brw_type_for_base_type(type);
698 }
699
700 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
701 {
702 init();
703
704 this->file = VGRF;
705 this->nr = v->alloc.allocate(type_size_vec4(type));
706
707 if (type->is_array() || type->is_record()) {
708 this->writemask = WRITEMASK_XYZW;
709 } else {
710 this->writemask = (1 << type->vector_elements) - 1;
711 }
712
713 this->type = brw_type_for_base_type(type);
714 }
715
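/* Emit a SEL with the given conditional mod.  With BRW_CONDITIONAL_GE this
 * computes max(src0, src1); with BRW_CONDITIONAL_L it computes min(src0, src1),
 * which is how the packing helpers above clamp values to [-1, 1].
 */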
716 vec4_instruction *
717 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
718 src_reg src0, src_reg src1)
719 {
720 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
721 inst->conditional_mod = conditionalmod;
722 return inst;
723 }
724
725 vec4_instruction *
726 vec4_visitor::emit_lrp(const dst_reg &dst,
727 const src_reg &x, const src_reg &y, const src_reg &a)
728 {
729 if (devinfo->gen >= 6) {
730 /* Note that the instruction's argument order is reversed from GLSL
731 * and the IR.
732 */
733 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
734 fix_3src_operand(x)));
735 } else {
736 /* Earlier generations don't support three source operations, so we
737 * need to emit x*(1-a) + y*a.
738 */
739 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
740 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
741 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
742 y_times_a.writemask = dst.writemask;
743 one_minus_a.writemask = dst.writemask;
744 x_times_one_minus_a.writemask = dst.writemask;
745
746 emit(MUL(y_times_a, y, a));
747 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
748 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
749 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
750 }
751 }
752
753 /**
754 * Emits the instructions needed to perform a pull constant load. before_block
 755 * and before_inst can be NULL, in which case the instructions will be appended
756 * to the end of the instruction list.
757 */
758 void
759 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
760 src_reg surf_index,
761 src_reg offset_reg,
762 bblock_t *before_block,
763 vec4_instruction *before_inst)
764 {
765 assert((before_inst == NULL && before_block == NULL) ||
766 (before_inst && before_block));
767
768 vec4_instruction *pull;
769
770 if (devinfo->gen >= 9) {
771 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
772 src_reg header(this, glsl_type::uvec4_type, 2);
773
774 pull = new(mem_ctx)
775 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
776 dst_reg(header));
777
778 if (before_inst)
779 emit_before(before_block, before_inst, pull);
780 else
781 emit(pull);
782
783 dst_reg index_reg = retype(offset(dst_reg(header), 1),
784 offset_reg.type);
785 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
786
787 if (before_inst)
788 emit_before(before_block, before_inst, pull);
789 else
790 emit(pull);
791
792 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
793 dst,
794 surf_index,
795 header);
796 pull->mlen = 2;
797 pull->header_size = 1;
798 } else if (devinfo->gen >= 7) {
799 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
800
801 grf_offset.type = offset_reg.type;
802
803 pull = MOV(grf_offset, offset_reg);
804
805 if (before_inst)
806 emit_before(before_block, before_inst, pull);
807 else
808 emit(pull);
809
810 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
811 dst,
812 surf_index,
813 src_reg(grf_offset));
814 pull->mlen = 1;
815 } else {
816 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
817 dst,
818 surf_index,
819 offset_reg);
820 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
821 pull->mlen = 1;
822 }
823
824 if (before_inst)
825 emit_before(before_block, before_inst, pull);
826 else
827 emit(pull);
828 }
829
830 src_reg
831 vec4_visitor::emit_uniformize(const src_reg &src)
832 {
833 const src_reg chan_index(this, glsl_type::uint_type);
834 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
835 src.type);
836
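   /* Find the index of any enabled channel, then broadcast that channel of
    * src to every channel of dst, yielding a value that is uniform across the
    * SIMD execution.  force_writemask_all keeps both instructions from being
    * affected by the current execution mask.
    */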
837 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
838 ->force_writemask_all = true;
839 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
840 ->force_writemask_all = true;
841
842 return src_reg(dst);
843 }
844
845 src_reg
846 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
847 src_reg coordinate, src_reg surface)
848 {
849 vec4_instruction *inst =
850 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
851 dst_reg(this, glsl_type::uvec4_type));
852 inst->base_mrf = 2;
853 inst->src[1] = surface;
854 inst->src[2] = surface;
855
856 int param_base;
857
858 if (devinfo->gen >= 9) {
859 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
860 vec4_instruction *header_inst = new(mem_ctx)
861 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
862 dst_reg(MRF, inst->base_mrf));
863
864 emit(header_inst);
865
866 inst->mlen = 2;
867 inst->header_size = 1;
868 param_base = inst->base_mrf + 1;
869 } else {
870 inst->mlen = 1;
871 param_base = inst->base_mrf;
872 }
873
 874 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
875 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
876 int zero_mask = 0xf & ~coord_mask;
877
878 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
879 coordinate));
880
881 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
882 brw_imm_d(0)));
883
884 emit(inst);
885 return src_reg(inst->dst);
886 }
887
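/* Returns true when the sampler index might not fit in the 4-bit sampler
 * field of the sampler message descriptor (a dynamically indexed sampler or
 * an immediate index of 16 or more), in which case the index has to be
 * conveyed through the message header.  Only Haswell and gen8+ can hit this.
 */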
888 bool
889 vec4_visitor::is_high_sampler(src_reg sampler)
890 {
891 if (devinfo->gen < 8 && !devinfo->is_haswell)
892 return false;
893
894 return sampler.file != IMM || sampler.ud >= 16;
895 }
896
897 void
898 vec4_visitor::emit_texture(ir_texture_opcode op,
899 dst_reg dest,
900 const glsl_type *dest_type,
901 src_reg coordinate,
902 int coord_components,
903 src_reg shadow_comparitor,
904 src_reg lod, src_reg lod2,
905 src_reg sample_index,
906 uint32_t constant_offset,
907 src_reg offset_value,
908 src_reg mcs,
909 bool is_cube_array,
910 uint32_t surface,
911 src_reg surface_reg,
912 uint32_t sampler,
913 src_reg sampler_reg)
914 {
915 /* The sampler can only meaningfully compute LOD for fragment shader
916 * messages. For all other stages, we change the opcode to TXL and hardcode
917 * the LOD to 0.
918 *
919 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
920 * valid LOD argument.
921 */
922 if (op == ir_tex || op == ir_query_levels) {
923 assert(lod.file == BAD_FILE);
924 lod = brw_imm_f(0.0f);
925 }
926
927 enum opcode opcode;
928 switch (op) {
929 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
930 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
931 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
932 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
933 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
934 SHADER_OPCODE_TXF_CMS); break;
935 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
936 case ir_tg4: opcode = offset_value.file != BAD_FILE
937 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
938 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
939 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
940 case ir_txb:
941 unreachable("TXB is not valid for vertex shaders.");
942 case ir_lod:
943 unreachable("LOD is not valid for vertex shaders.");
944 case ir_samples_identical: {
945 /* There are some challenges implementing this for vec4, and it seems
 946 * unlikely to be used anyway. For now, just always return false.
947 */
948 emit(MOV(dest, brw_imm_ud(0u)));
949 return;
950 }
951 default:
952 unreachable("Unrecognized tex op");
953 }
954
955 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
956
957 inst->offset = constant_offset;
958
959 /* The message header is necessary for:
960 * - Gen4 (always)
961 * - Gen9+ for selecting SIMD4x2
962 * - Texel offsets
963 * - Gather channel selection
964 * - Sampler indices too large to fit in a 4-bit value.
965 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
966 */
967 inst->header_size =
968 (devinfo->gen < 5 || devinfo->gen >= 9 ||
969 inst->offset != 0 || op == ir_tg4 ||
970 op == ir_texture_samples ||
971 is_high_sampler(sampler_reg)) ? 1 : 0;
972 inst->base_mrf = 2;
973 inst->mlen = inst->header_size;
974 inst->dst.writemask = WRITEMASK_XYZW;
975 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
976
977 inst->src[1] = surface_reg;
978 inst->src[2] = sampler_reg;
979
980 /* MRF for the first parameter */
981 int param_base = inst->base_mrf + inst->header_size;
982
983 if (op == ir_txs || op == ir_query_levels) {
984 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
985 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
986 inst->mlen++;
987 } else if (op == ir_texture_samples) {
988 inst->dst.writemask = WRITEMASK_X;
989 } else {
990 /* Load the coordinate */
991 /* FINISHME: gl_clamp_mask and saturate */
992 int coord_mask = (1 << coord_components) - 1;
993 int zero_mask = 0xf & ~coord_mask;
994
995 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
996 coordinate));
997 inst->mlen++;
998
999 if (zero_mask != 0) {
1000 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
1001 brw_imm_d(0)));
1002 }
1003 /* Load the shadow comparitor */
1004 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1005 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
1006 WRITEMASK_X),
1007 shadow_comparitor));
1008 inst->mlen++;
1009 }
1010
1011 /* Load the LOD info */
1012 if (op == ir_tex || op == ir_txl) {
1013 int mrf, writemask;
1014 if (devinfo->gen >= 5) {
1015 mrf = param_base + 1;
1016 if (shadow_comparitor.file != BAD_FILE) {
1017 writemask = WRITEMASK_Y;
1018 /* mlen already incremented */
1019 } else {
1020 writemask = WRITEMASK_X;
1021 inst->mlen++;
1022 }
1023 } else /* devinfo->gen == 4 */ {
1024 mrf = param_base;
1025 writemask = WRITEMASK_W;
1026 }
1027 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1028 } else if (op == ir_txf) {
1029 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1030 } else if (op == ir_txf_ms) {
1031 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1032 sample_index));
1033 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1034 /* MCS data is stored in the first two channels of `mcs`, but we
1035 * need to get it into the .y and .z channels of the second vec4
1036 * of params.
1037 */
1038 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1039 emit(MOV(dst_reg(MRF, param_base + 1,
1040 glsl_type::uint_type, WRITEMASK_YZ),
1041 mcs));
1042 } else if (devinfo->gen >= 7) {
1043 /* MCS data is in the first channel of `mcs`, but we need to get it into
1044 * the .y channel of the second vec4 of params, so replicate .x across
1045 * the whole vec4 and then mask off everything except .y
1046 */
1047 mcs.swizzle = BRW_SWIZZLE_XXXX;
1048 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1049 mcs));
1050 }
1051 inst->mlen++;
1052 } else if (op == ir_txd) {
1053 const brw_reg_type type = lod.type;
1054
1055 if (devinfo->gen >= 5) {
1056 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1057 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1058 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1059 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1060 inst->mlen++;
1061
1062 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1063 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1064 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1065 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1066 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1067 inst->mlen++;
1068
1069 if (shadow_comparitor.file != BAD_FILE) {
1070 emit(MOV(dst_reg(MRF, param_base + 2,
1071 shadow_comparitor.type, WRITEMASK_Z),
1072 shadow_comparitor));
1073 }
1074 }
1075 } else /* devinfo->gen == 4 */ {
1076 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1077 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1078 inst->mlen += 2;
1079 }
1080 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1081 if (shadow_comparitor.file != BAD_FILE) {
1082 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1083 shadow_comparitor));
1084 }
1085
1086 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1087 offset_value));
1088 inst->mlen++;
1089 }
1090 }
1091
1092 emit(inst);
1093
1094 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1095 * spec requires layers.
1096 */
1097 if (op == ir_txs) {
1098 if (is_cube_array) {
1099 emit_math(SHADER_OPCODE_INT_QUOTIENT,
1100 writemask(inst->dst, WRITEMASK_Z),
1101 src_reg(inst->dst), brw_imm_d(6));
1102 } else if (devinfo->gen < 7) {
1103 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1104 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1105 src_reg(inst->dst), brw_imm_d(1));
1106 }
1107 }
1108
1109 if (devinfo->gen == 6 && op == ir_tg4) {
1110 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1111 }
1112
1113 if (op == ir_query_levels) {
1114 /* # levels is in .w */
1115 src_reg swizzled(dest);
1116 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1117 SWIZZLE_W, SWIZZLE_W);
1118 emit(MOV(dest, swizzled));
1119 }
1120 }
1121
1122 /**
1123 * Apply workarounds for Gen6 gather with UINT/SINT
1124 */
1125 void
1126 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1127 {
1128 if (!wa)
1129 return;
1130
1131 int width = (wa & WA_8BIT) ? 8 : 16;
1132 dst_reg dst_f = dst;
1133 dst_f.type = BRW_REGISTER_TYPE_F;
1134
1135 /* Convert from UNORM to UINT */
1136 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1137 emit(MOV(dst, src_reg(dst_f)));
1138
1139 if (wa & WA_SIGN) {
1140 /* Reinterpret the UINT value as a signed INT value by
1141 * shifting the sign bit into place, then shifting back
1142 * preserving sign.
1143 */
1144 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1145 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1146 }
1147 }
1148
1149 void
1150 vec4_visitor::gs_emit_vertex(int stream_id)
1151 {
1152 unreachable("not reached");
1153 }
1154
1155 void
1156 vec4_visitor::gs_end_primitive()
1157 {
1158 unreachable("not reached");
1159 }
1160
1161 void
1162 vec4_visitor::emit_ndc_computation()
1163 {
1164 if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1165 return;
1166
1167 /* Get the position */
1168 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1169
1170 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1171 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1172 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1173
1174 current_annotation = "NDC";
1175 dst_reg ndc_w = ndc;
1176 ndc_w.writemask = WRITEMASK_W;
1177 src_reg pos_w = pos;
1178 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1179 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1180
1181 dst_reg ndc_xyz = ndc;
1182 ndc_xyz.writemask = WRITEMASK_XYZ;
1183
1184 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1185 }
1186
1187 void
1188 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1189 {
1190 if (devinfo->gen < 6 &&
1191 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1192 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1193 devinfo->has_negative_rhw_bug)) {
1194 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1195 dst_reg header1_w = header1;
1196 header1_w.writemask = WRITEMASK_W;
1197
1198 emit(MOV(header1, brw_imm_ud(0u)));
1199
1200 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1201 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1202
1203 current_annotation = "Point size";
1204 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1205 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1206 }
1207
1208 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1209 current_annotation = "Clipping flags";
1210 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1211 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1212
1213 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1214 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1215 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1216
1217 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1218 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1219 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1220 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1221 }
1222
1223 /* i965 clipping workaround:
1224 * 1) Test for -ve rhw
1225 * 2) If set,
1226 * set ndc = (0,0,0,0)
1227 * set ucp[6] = 1
1228 *
1229 * Later, clipping will detect ucp[6] and ensure the primitive is
1230 * clipped against all fixed planes.
1231 */
1232 if (devinfo->has_negative_rhw_bug &&
1233 output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1234 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1235 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1236 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1237 vec4_instruction *inst;
1238 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1239 inst->predicate = BRW_PREDICATE_NORMAL;
1240 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1241 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], brw_imm_f(0.0f)));
1242 inst->predicate = BRW_PREDICATE_NORMAL;
1243 }
1244
1245 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1246 } else if (devinfo->gen < 6) {
1247 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1248 } else {
1249 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1250 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1251 dst_reg reg_w = reg;
1252 reg_w.writemask = WRITEMASK_W;
1253 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1254 reg_as_src.type = reg_w.type;
1255 reg_as_src.swizzle = brw_swizzle_for_size(1);
1256 emit(MOV(reg_w, reg_as_src));
1257 }
1258 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1259 dst_reg reg_y = reg;
1260 reg_y.writemask = WRITEMASK_Y;
1261 reg_y.type = BRW_REGISTER_TYPE_D;
1262 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1263 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1264 }
1265 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1266 dst_reg reg_z = reg;
1267 reg_z.writemask = WRITEMASK_Z;
1268 reg_z.type = BRW_REGISTER_TYPE_D;
1269 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1270 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1271 }
1272 }
1273 }
1274
1275 vec4_instruction *
1276 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1277 {
1278 assert(varying < VARYING_SLOT_MAX);
1279 assert(output_reg[varying].type == reg.type);
1280 current_annotation = output_reg_annotation[varying];
1281 if (output_reg[varying].file != BAD_FILE)
1282 return emit(MOV(reg, src_reg(output_reg[varying])));
1283 else
1284 return NULL;
1285 }
1286
1287 void
1288 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1289 {
1290 reg.type = BRW_REGISTER_TYPE_F;
1291 output_reg[varying].type = reg.type;
1292
1293 switch (varying) {
1294 case VARYING_SLOT_PSIZ:
1295 {
1296 /* PSIZ is always in slot 0, and is coupled with other flags. */
1297 current_annotation = "indices, point width, clip flags";
1298 emit_psiz_and_flags(reg);
1299 break;
1300 }
1301 case BRW_VARYING_SLOT_NDC:
1302 current_annotation = "NDC";
1303 if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1304 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1305 break;
1306 case VARYING_SLOT_POS:
1307 current_annotation = "gl_Position";
1308 if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1309 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1310 break;
1311 case VARYING_SLOT_EDGE:
1312 /* This is present when doing unfilled polygons. We're supposed to copy
1313 * the edge flag from the user-provided vertex array
1314 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1315 * of that attribute (starts as 1.0f). This is then used in clipping to
1316 * determine which edges should be drawn as wireframe.
1317 */
1318 current_annotation = "edge flag";
1319 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1320 glsl_type::float_type, WRITEMASK_XYZW))));
1321 break;
1322 case BRW_VARYING_SLOT_PAD:
1323 /* No need to write to this slot */
1324 break;
1325 default:
1326 emit_generic_urb_slot(reg, varying);
1327 break;
1328 }
1329 }
1330
1331 static int
1332 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1333 {
1334 if (devinfo->gen >= 6) {
1335 /* URB data written (does not include the message header reg) must
1336 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1337 * section 5.4.3.2.2: URB_INTERLEAVED.
1338 *
1339 * URB entries are allocated on a multiple of 1024 bits, so an
1340 * extra 128 bits written here to make the end align to 256 is
1341 * no problem.
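       * Note that mlen counts the message header register as well, so the
       * payload is (mlen - 1) registers; bumping an even mlen to the next odd
       * value is what makes the payload an even number of registers.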
1342 */
1343 if ((mlen % 2) != 1)
1344 mlen++;
1345 }
1346
1347 return mlen;
1348 }
1349
1350
1351 /**
1352 * Generates the VUE payload plus the necessary URB write instructions to
1353 * output it.
1354 *
1355 * The VUE layout is documented in Volume 2a.
1356 */
1357 void
1358 vec4_visitor::emit_vertex()
1359 {
1360 /* MRF 0 is reserved for the debugger, so start with message header
1361 * in MRF 1.
1362 */
1363 int base_mrf = 1;
1364 int mrf = base_mrf;
1365 /* In the process of generating our URB write message contents, we
1366 * may need to unspill a register or load from an array. Those
1367 * reads would use MRFs 14-15.
1368 */
1369 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1370
1371 /* The following assertion verifies that max_usable_mrf causes an
1372 * even-numbered amount of URB write data, which will meet gen6's
1373 * requirements for length alignment.
1374 */
1375 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1376
1377 /* First mrf is the g0-based message header containing URB handles and
1378 * such.
1379 */
1380 emit_urb_write_header(mrf++);
1381
1382 if (devinfo->gen < 6) {
1383 emit_ndc_computation();
1384 }
1385
1386 /* We may need to split this up into several URB writes, so do them in a
1387 * loop.
1388 */
1389 int slot = 0;
1390 bool complete = false;
1391 do {
1392 /* URB offset is in URB row increments, and each of our MRFs is half of
1393 * one of those, since we're doing interleaved writes.
1394 */
1395 int offset = slot / 2;
1396
1397 mrf = base_mrf + 1;
1398 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1399 emit_urb_slot(dst_reg(MRF, mrf++),
1400 prog_data->vue_map.slot_to_varying[slot]);
1401
1402 /* If this was max_usable_mrf, we can't fit anything more into this
1403 * URB WRITE. Same thing if we reached the maximum length available.
1404 */
1405 if (mrf > max_usable_mrf ||
1406 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1407 slot++;
1408 break;
1409 }
1410 }
1411
1412 complete = slot >= prog_data->vue_map.num_slots;
1413 current_annotation = "URB write";
1414 vec4_instruction *inst = emit_urb_write_opcode(complete);
1415 inst->base_mrf = base_mrf;
1416 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1417 inst->offset += offset;
1418 } while(!complete);
1419 }
1420
1421
1422 src_reg
1423 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1424 src_reg *reladdr, int reg_offset)
1425 {
1426 /* Because we store the values to scratch interleaved like our
1427 * vertex data, we need to scale the vec4 index by 2.
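    * (For example, vec4 offset 3 becomes row offset 6 on gen6+; pre-gen6 the
    * message header wants byte offsets, so the same access becomes
    * 3 * 2 * 16 = 96.)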
1428 */
1429 int message_header_scale = 2;
1430
1431 /* Pre-gen6, the message header uses byte offsets instead of vec4
1432 * (16-byte) offset units.
1433 */
1434 if (devinfo->gen < 6)
1435 message_header_scale *= 16;
1436
1437 if (reladdr) {
1438 src_reg index = src_reg(this, glsl_type::int_type);
1439
1440 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1441 brw_imm_d(reg_offset)));
1442 emit_before(block, inst, MUL(dst_reg(index), index,
1443 brw_imm_d(message_header_scale)));
1444
1445 return index;
1446 } else {
1447 return brw_imm_d(reg_offset * message_header_scale);
1448 }
1449 }
1450
1451 /**
1452 * Emits an instruction before @inst to load the value named by @orig_src
1453 * from scratch space at @base_offset to @temp.
1454 *
1455 * @base_offset is measured in 32-byte units (the size of a register).
1456 */
1457 void
1458 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1459 dst_reg temp, src_reg orig_src,
1460 int base_offset)
1461 {
1462 int reg_offset = base_offset + orig_src.reg_offset;
1463 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1464 reg_offset);
1465
1466 emit_before(block, inst, SCRATCH_READ(temp, index));
1467 }
1468
1469 /**
1470 * Emits an instruction after @inst to store the value to be written
1471 * to @orig_dst to scratch space at @base_offset, from @temp.
1472 *
1473 * @base_offset is measured in 32-byte units (the size of a register).
1474 */
1475 void
1476 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1477 int base_offset)
1478 {
1479 int reg_offset = base_offset + inst->dst.reg_offset;
1480 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1481 reg_offset);
1482
1483 /* Create a temporary register to store *inst's result in.
1484 *
1485 * We have to be careful in MOVing from our temporary result register in
1486 * the scratch write. If we swizzle from channels of the temporary that
1487 * weren't initialized, it will confuse live interval analysis, which will
1488 * make spilling fail to make progress.
1489 */
1490 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1491 inst->dst.type),
1492 brw_swizzle_for_mask(inst->dst.writemask));
1493 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1494 inst->dst.writemask));
1495 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1496 if (inst->opcode != BRW_OPCODE_SEL)
1497 write->predicate = inst->predicate;
1498 write->ir = inst->ir;
1499 write->annotation = inst->annotation;
1500 inst->insert_after(block, write);
1501
1502 inst->dst.file = temp.file;
1503 inst->dst.nr = temp.nr;
1504 inst->dst.reg_offset = temp.reg_offset;
1505 inst->dst.reladdr = NULL;
1506 }
1507
1508 /**
1509 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1510 * adds the scratch read(s) before \p inst. The function also checks for
1511 * recursive reladdr scratch accesses, issuing the corresponding scratch
1512 * loads and rewriting reladdr references accordingly.
1513 *
1514 * \return \p src if it did not require a scratch load, otherwise, the
1515 * register holding the result of the scratch load that the caller should
1516 * use to rewrite src.
1517 */
1518 src_reg
1519 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1520 vec4_instruction *inst, src_reg src)
1521 {
1522 /* Resolve recursive reladdr scratch access by calling ourselves
1523 * with src.reladdr
1524 */
1525 if (src.reladdr)
1526 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1527 *src.reladdr);
1528
1529 /* Now handle scratch access on src */
1530 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1531 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1532 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1533 src.nr = temp.nr;
1534 src.reg_offset = temp.reg_offset;
1535 src.reladdr = NULL;
1536 }
1537
1538 return src;
1539 }
1540
1541 /**
1542 * We can't generally support array access in GRF space, because a
1543 * single instruction's destination can only span 2 contiguous
1544 * registers. So, we send all GRF arrays that get variable index
1545 * access to scratch space.
1546 */
1547 void
1548 vec4_visitor::move_grf_array_access_to_scratch()
1549 {
1550 int scratch_loc[this->alloc.count];
1551 memset(scratch_loc, -1, sizeof(scratch_loc));
1552
1553 /* First, calculate the set of virtual GRFs that need to be punted
1554 * to scratch due to having any array access on them, and where in
1555 * scratch.
1556 */
1557 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1558 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1559 if (scratch_loc[inst->dst.nr] == -1) {
1560 scratch_loc[inst->dst.nr] = last_scratch;
1561 last_scratch += this->alloc.sizes[inst->dst.nr];
1562 }
1563
1564 for (src_reg *iter = inst->dst.reladdr;
1565 iter->reladdr;
1566 iter = iter->reladdr) {
1567 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1568 scratch_loc[iter->nr] = last_scratch;
1569 last_scratch += this->alloc.sizes[iter->nr];
1570 }
1571 }
1572 }
1573
1574 for (int i = 0 ; i < 3; i++) {
1575 for (src_reg *iter = &inst->src[i];
1576 iter->reladdr;
1577 iter = iter->reladdr) {
1578 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1579 scratch_loc[iter->nr] = last_scratch;
1580 last_scratch += this->alloc.sizes[iter->nr];
1581 }
1582 }
1583 }
1584 }
1585
1586 /* Now, for anything that will be accessed through scratch, rewrite
1587 * it to load/store. Note that this is a _safe list walk, because
1588 * we may generate a new scratch_write instruction after the one
1589 * we're processing.
1590 */
1591 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1592 /* Set up the annotation tracking for new generated instructions. */
1593 base_ir = inst->ir;
1594 current_annotation = inst->annotation;
1595
1596 /* First handle scratch access on the dst. Notice we have to handle
1597 * the case where the dst's reladdr also points to scratch space.
1598 */
1599 if (inst->dst.reladdr)
1600 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1601 *inst->dst.reladdr);
1602
1603 /* Now that we have handled any (possibly recursive) reladdr scratch
1604 * accesses for dst we can safely do the scratch write for dst itself
1605 */
1606 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1607 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1608
1609 /* Now handle scratch access on any src. In this case, since inst->src[i]
1610 * already is a src_reg, we can just call emit_resolve_reladdr with
1611 * inst->src[i] and it will take care of handling scratch loads for
1612 * both src and src.reladdr (recursively).
1613 */
1614 for (int i = 0 ; i < 3; i++) {
1615 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1616 inst->src[i]);
1617 }
1618 }
1619 }
1620
1621 /**
1622 * Emits an instruction before @inst to load the value named by @orig_src
1623 * from the pull constant buffer (surface) at @base_offset to @temp.
1624 */
1625 void
1626 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1627 dst_reg temp, src_reg orig_src,
1628 int base_offset, src_reg indirect)
1629 {
1630 int reg_offset = base_offset + orig_src.reg_offset;
1631 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1632
1633 src_reg offset;
1634 if (indirect.file != BAD_FILE) {
1635 offset = src_reg(this, glsl_type::uint_type);
1636
1637 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1638 brw_imm_ud(reg_offset * 16)));
1639 } else if (devinfo->gen >= 8) {
1640 /* Store the offset in a GRF so we can send-from-GRF. */
1641 offset = src_reg(this, glsl_type::uint_type);
1642 emit_before(block, inst, MOV(dst_reg(offset), brw_imm_ud(reg_offset * 16)));
1643 } else {
1644 offset = brw_imm_d(reg_offset * 16);
1645 }
1646
1647 emit_pull_constant_load_reg(temp,
1648 brw_imm_ud(index),
1649 offset,
1650 block, inst);
1651
1652 brw_mark_surface_used(&prog_data->base, index);
1653 }
1654
1655 /**
1656 * Implements array access of uniforms by inserting a
1657 * PULL_CONSTANT_LOAD instruction.
1658 *
1659 * Unlike temporary GRF array access (where we don't support it due to
1660 * the difficulty of doing relative addressing on instruction
1661 * destinations), we could potentially do array access of uniforms
1662 * that were loaded in GRF space as push constants. In real-world
1663 * usage we've seen, though, the arrays being used are always larger
1664 * than we could load as push constants, so just always move all
1665 * uniform array access out to a pull constant buffer.
1666 */
1667 void
1668 vec4_visitor::move_uniform_array_access_to_pull_constants()
1669 {
1670 /* The Vulkan driver doesn't support pull constants other than UBOs, so
1671 * everything has to be pushed regardless.
1672 */
1673 if (stage_prog_data->pull_param == NULL) {
1674 split_uniform_registers();
1675 return;
1676 }
1677
1678 int pull_constant_loc[this->uniforms];
1679 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1680
1681 /* First, walk through the instructions and determine which things need to
1682 * be pulled. We mark something as needing to be pulled by setting
1683 * pull_constant_loc to 0.
1684 */
1685 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1686 /* We only care about MOV_INDIRECT of a uniform */
1687 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1688 inst->src[0].file != UNIFORM)
1689 continue;
1690
1691 int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
1692
1693 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1694 pull_constant_loc[uniform_nr + j] = 0;
1695 }
1696
1697 /* Next, we walk the list of uniforms and assign real pull constant
1698 * locations and set their corresponding entries in pull_param.
1699 */
1700 for (int j = 0; j < this->uniforms; j++) {
1701 if (pull_constant_loc[j] < 0)
1702 continue;
1703
1704 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1705
1706 for (int i = 0; i < 4; i++) {
1707 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1708 = stage_prog_data->param[j * 4 + i];
1709 }
1710 }
1711
1712 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1713 * instructions to actual uniform pulls.
1714 */
1715 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1716 /* We only care about MOV_INDIRECT of a uniform */
1717 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1718 inst->src[0].file != UNIFORM)
1719 continue;
1720
1721 int uniform_nr = inst->src[0].nr + inst->src[0].reg_offset;
1722
1723 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1724
1725 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1726 pull_constant_loc[uniform_nr], inst->src[1]);
1727 inst->remove(block);
1728 }
1729
1730 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1731 * no need to track them as larger-than-vec4 objects. This will be
1732 * relied on in cutting out unused uniform vectors from push
1733 * constants.
1734 */
1735 split_uniform_registers();
1736 }
1737
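/* Used by CMP and the gen6 IF-with-comparison above: if an unsigned source
 * has the negate modifier set, materialize the negation up front with an
 * explicit MOV into a temporary instead of relying on the source modifier.
 */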
1738 void
1739 vec4_visitor::resolve_ud_negate(src_reg *reg)
1740 {
1741 if (reg->type != BRW_REGISTER_TYPE_UD ||
1742 !reg->negate)
1743 return;
1744
1745 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1746 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1747 *reg = temp;
1748 }
1749
1750 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1751 void *log_data,
1752 const struct brw_sampler_prog_key_data *key_tex,
1753 struct brw_vue_prog_data *prog_data,
1754 const nir_shader *shader,
1755 void *mem_ctx,
1756 bool no_spills,
1757 int shader_time_index)
1758 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1759 key_tex(key_tex),
1760 prog_data(prog_data),
1761 fail_msg(NULL),
1762 first_non_payload_grf(0),
1763 need_all_constants_in_pull_buffer(false),
1764 no_spills(no_spills),
1765 shader_time_index(shader_time_index),
1766 last_scratch(0)
1767 {
1768 this->failed = false;
1769
1770 this->base_ir = NULL;
1771 this->current_annotation = NULL;
1772 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1773
1774 this->virtual_grf_start = NULL;
1775 this->virtual_grf_end = NULL;
1776 this->live_intervals = NULL;
1777
1778 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1779
1780 this->uniforms = 0;
1781 }
1782
1783 vec4_visitor::~vec4_visitor()
1784 {
1785 }
1786
1787
1788 void
1789 vec4_visitor::fail(const char *format, ...)
1790 {
1791 va_list va;
1792 char *msg;
1793
1794 if (failed)
1795 return;
1796
1797 failed = true;
1798
1799 va_start(va, format);
1800 msg = ralloc_vasprintf(mem_ctx, format, va);
1801 va_end(va);
1802 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1803
1804 this->fail_msg = msg;
1805
1806 if (debug_enabled) {
1807 fprintf(stderr, "%s", msg);
1808 }
1809 }
1810
1811 } /* namespace brw */