i965: Handle unwritten PSIZ/VIEWPORT/LAYER outputs in vec4 shaders.
[mesa.git] / src / intel / compiler / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27
28 namespace brw {
29
30 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
31 const src_reg &src0, const src_reg &src1,
32 const src_reg &src2)
33 {
34 this->opcode = opcode;
35 this->dst = dst;
36 this->src[0] = src0;
37 this->src[1] = src1;
38 this->src[2] = src2;
39 this->saturate = false;
40 this->force_writemask_all = false;
41 this->no_dd_clear = false;
42 this->no_dd_check = false;
43 this->writes_accumulator = false;
44 this->conditional_mod = BRW_CONDITIONAL_NONE;
45 this->predicate = BRW_PREDICATE_NONE;
46 this->predicate_inverse = false;
47 this->target = 0;
48 this->shadow_compare = false;
49 this->ir = NULL;
50 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
51 this->header_size = 0;
52 this->flag_subreg = 0;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
56 this->exec_size = 8;
57 this->group = 0;
58 this->size_written = (dst.file == BAD_FILE ?
59 0 : this->exec_size * type_sz(dst.type));
60 this->annotation = NULL;
61 }
62
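/**
 * Append \p inst to the end of the instruction list, tagging it with the
 * current base IR node and annotation string so later debug dumps can
 * attribute the instruction back to the source it came from.
 */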
63 vec4_instruction *
64 vec4_visitor::emit(vec4_instruction *inst)
65 {
66 inst->ir = this->base_ir;
67 inst->annotation = this->current_annotation;
68
69 this->instructions.push_tail(inst);
70
71 return inst;
72 }
73
74 vec4_instruction *
75 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
76 vec4_instruction *new_inst)
77 {
78 new_inst->ir = inst->ir;
79 new_inst->annotation = inst->annotation;
80
81 inst->insert_before(block, new_inst);
82
83 return inst;
84 }
85
86 vec4_instruction *
87 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
88 const src_reg &src1, const src_reg &src2)
89 {
90 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
91 }
92
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
96 const src_reg &src1)
97 {
98 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
103 {
104 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
109 {
110 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
111 }
112
113 vec4_instruction *
114 vec4_visitor::emit(enum opcode opcode)
115 {
116 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
117 }
118
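/* The ALU1/ALU2/ALU2_ACC/ALU3 macros below stamp out thin builder methods
 * (NOT, MOV, ADD, MAD, ...) that only allocate a vec4_instruction out of
 * mem_ctx; the caller still has to pass the result to emit() to append it
 * to the instruction stream.  ALU2_ACC additionally marks the instruction
 * as writing the accumulator, and ALU3 asserts Gen6+ since three-source
 * instructions don't exist on earlier parts.
 */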
119 #define ALU1(op) \
120 vec4_instruction * \
121 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
122 { \
123 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
124 }
125
126 #define ALU2(op) \
127 vec4_instruction * \
128 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
129 const src_reg &src1) \
130 { \
131 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
132 src0, src1); \
133 }
134
135 #define ALU2_ACC(op) \
136 vec4_instruction * \
137 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
138 const src_reg &src1) \
139 { \
140 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
141 BRW_OPCODE_##op, dst, src0, src1); \
142 inst->writes_accumulator = true; \
143 return inst; \
144 }
145
146 #define ALU3(op) \
147 vec4_instruction * \
148 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
149 const src_reg &src1, const src_reg &src2) \
150 { \
151 assert(devinfo->gen >= 6); \
152 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
153 src0, src1, src2); \
154 }
155
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU1(F32TO16)
163 ALU1(F16TO32)
164 ALU2(ADD)
165 ALU2(MUL)
166 ALU2_ACC(MACH)
167 ALU2(AND)
168 ALU2(OR)
169 ALU2(XOR)
170 ALU2(DP3)
171 ALU2(DP4)
172 ALU2(DPH)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(MAC)
188 ALU1(DIM)
189
190 /** Gen4 predicated IF. */
191 vec4_instruction *
192 vec4_visitor::IF(enum brw_predicate predicate)
193 {
194 vec4_instruction *inst;
195
196 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
197 inst->predicate = predicate;
198
199 return inst;
200 }
201
202 /** Gen6 IF with embedded comparison. */
203 vec4_instruction *
204 vec4_visitor::IF(src_reg src0, src_reg src1,
205 enum brw_conditional_mod condition)
206 {
207 assert(devinfo->gen == 6);
208
209 vec4_instruction *inst;
210
211 resolve_ud_negate(&src0);
212 resolve_ud_negate(&src1);
213
214 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
215 src0, src1);
216 inst->conditional_mod = condition;
217
218 return inst;
219 }
220
221 /**
222 * CMP: Sets the low bit of the destination channels with the result
223 * of the comparison, while the upper bits are undefined, and updates
224 * the flag register with the packed 16 bits of the result.
225 */
226 vec4_instruction *
227 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
228 enum brw_conditional_mod condition)
229 {
230 vec4_instruction *inst;
231
232 /* Take the instruction:
233 *
234 * CMP null<d> src0<f> src1<f>
235 *
236 * Original gen4 does type conversion to the destination type before
237 * comparison, producing garbage results for floating point comparisons.
238 *
239 * The destination type doesn't matter on newer generations, so we set the
240 * type to match src0 so we can compact the instruction.
241 */
242 dst.type = src0.type;
243
244 resolve_ud_negate(&src0);
245 resolve_ud_negate(&src1);
246
247 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
248 inst->conditional_mod = condition;
249
250 return inst;
251 }
252
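/* Builders for the Gen4-style scratch-space messages used for spills.  The
 * payload is assembled in the MRF range reserved for spilling, so these set
 * base_mrf and mlen accordingly: two registers for a read and three for a
 * write (which also carries the data to be written).
 */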
253 vec4_instruction *
254 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
255 {
256 vec4_instruction *inst;
257
258 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
259 dst, index);
260 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
261 inst->mlen = 2;
262
263 return inst;
264 }
265
266 vec4_instruction *
267 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
268 const src_reg &index)
269 {
270 vec4_instruction *inst;
271
272 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
273 dst, src, index);
274 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
275 inst->mlen = 3;
276
277 return inst;
278 }
279
280 src_reg
281 vec4_visitor::fix_3src_operand(const src_reg &src)
282 {
283 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
284 * able to use vertical stride of zero to replicate the vec4 uniform, like
285 *
286 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
287 *
288 * But you can't, since vertical stride is always four in three-source
289 * instructions. Instead, insert a MOV instruction to do the replication so
290 * that the three-source instruction can consume it.
291 */
292
293 /* The MOV is only needed if the source is a uniform or immediate. */
294 if (src.file != UNIFORM && src.file != IMM)
295 return src;
296
297 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
298 return src;
299
300 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
301 expanded.type = src.type;
302 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
303 return src_reg(expanded);
304 }
305
306 src_reg
307 vec4_visitor::resolve_source_modifiers(const src_reg &src)
308 {
309 if (!src.abs && !src.negate)
310 return src;
311
312 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
313 resolved.type = src.type;
314 emit(MOV(resolved, src));
315
316 return src_reg(resolved);
317 }
318
319 src_reg
320 vec4_visitor::fix_math_operand(const src_reg &src)
321 {
322 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
323 return src;
324
325 /* The gen6 math instruction ignores the source modifiers --
326 * swizzle, abs, negate, and at least some parts of the register
327 * region description.
328 *
329 * Rather than trying to enumerate all these cases, *always* expand the
330 * operand to a temp GRF for gen6.
331 *
332 * For gen7, keep the operand as-is, except if immediate, which gen7 still
333 * can't use.
334 */
335
336 if (devinfo->gen == 7 && src.file != IMM)
337 return src;
338
339 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
340 expanded.type = src.type;
341 emit(MOV(expanded, src));
342 return src_reg(expanded);
343 }
344
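/**
 * Emit a math instruction, working around per-generation restrictions:
 * Gen6 MATH must run in align1 mode and so can't honor a partial writemask
 * (we compute into a temporary and MOV the result out), while pre-Gen6
 * math is a send-like operation that needs base_mrf/mlen set up.
 */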
345 vec4_instruction *
346 vec4_visitor::emit_math(enum opcode opcode,
347 const dst_reg &dst,
348 const src_reg &src0, const src_reg &src1)
349 {
350 vec4_instruction *math =
351 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
352
353 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
354 /* MATH on Gen6 must be align1, so we can't do writemasks. */
355 math->dst = dst_reg(this, glsl_type::vec4_type);
356 math->dst.type = dst.type;
357 math = emit(MOV(dst, src_reg(math->dst)));
358 } else if (devinfo->gen < 6) {
359 math->base_mrf = 1;
360 math->mlen = src1.file == BAD_FILE ? 1 : 2;
361 }
362
363 return math;
364 }
365
366 void
367 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
368 {
369 if (devinfo->gen < 7) {
370 unreachable("ir_unop_pack_half_2x16 should be lowered");
371 }
372
373 assert(dst.type == BRW_REGISTER_TYPE_UD);
374 assert(src0.type == BRW_REGISTER_TYPE_F);
375
376 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
377 *
378 * Because this instruction does not have a 16-bit floating-point type,
379 * the destination data type must be Word (W).
380 *
381 * The destination must be DWord-aligned and specify a horizontal stride
382 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
383 * each destination channel and the upper word is not modified.
384 *
385 * The above restriction implies that the f32to16 instruction must use
386 * align1 mode, because only in align1 mode is it possible to specify
387 * horizontal stride. We choose here to defy the hardware docs and emit
388 * align16 instructions.
389 *
390 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
391 * instructions. I was partially successful in that the code passed all
392 * tests. However, the code was dubiously correct and fragile, and the
393 * tests were not harsh enough to probe that frailty. Not trusting the
394 * code, I chose instead to remain in align16 mode in defiance of the hw
395 * docs).
396 *
397 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
398 * simulator, emitting a f32to16 in align16 mode with UD as destination
399 * data type is safe. The behavior differs from that specified in the PRM
400 * in that the upper word of each destination channel is cleared to 0.
401 */
402
403 dst_reg tmp_dst(this, glsl_type::uvec2_type);
404 src_reg tmp_src(tmp_dst);
405
406 #if 0
407 /* Verify the undocumented behavior on which the following instructions
408 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
409 * then the result of the bit-or instruction below will be incorrect.
410 *
411 * You should inspect the disasm output in order to verify that the MOV is
412 * not optimized away.
413 */
414 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
415 #endif
416
417 /* Give tmp the form below, where "." means untouched.
418 *
419 * w z y x w z y x
420 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
421 *
422 * That the upper word of each write-channel be 0 is required for the
423 * following bit-shift and bit-or instructions to work. Note that this
424 * relies on the undocumented hardware behavior mentioned above.
425 */
426 tmp_dst.writemask = WRITEMASK_XY;
427 emit(F32TO16(tmp_dst, src0));
428
429 /* Give the write-channels of dst the form:
430 * 0xhhhh0000
431 */
432 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
433 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
434
435 /* Finally, give the write-channels of dst the form of packHalf2x16's
436 * output:
437 * 0xhhhhllll
438 */
439 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
440 emit(OR(dst, src_reg(dst), tmp_src));
441 }
442
443 void
444 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
445 {
446 if (devinfo->gen < 7) {
447 unreachable("ir_unop_unpack_half_2x16 should be lowered");
448 }
449
450 assert(dst.type == BRW_REGISTER_TYPE_F);
451 assert(src0.type == BRW_REGISTER_TYPE_UD);
452
453 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
454 *
455 * Because this instruction does not have a 16-bit floating-point type,
456 * the source data type must be Word (W). The destination type must be
457 * F (Float).
458 *
459 * To use W as the source data type, we must adjust horizontal strides,
460 * which is only possible in align1 mode. All my [chadv] attempts at
461 * emitting align1 instructions for unpackHalf2x16 failed to pass the
462 * Piglit tests, so I gave up.
463 *
464 * I've verified that, on gen7 hardware and the simulator, it is safe to
465 * emit f16to32 in align16 mode with UD as source data type.
466 */
467
468 dst_reg tmp_dst(this, glsl_type::uvec2_type);
469 src_reg tmp_src(tmp_dst);
470
471 tmp_dst.writemask = WRITEMASK_X;
472 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
473
474 tmp_dst.writemask = WRITEMASK_Y;
475 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
476
477 dst.writemask = WRITEMASK_XY;
478 emit(F16TO32(dst, tmp_src));
479 }
480
481 void
482 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
483 {
484 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
485 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
486 * is not suitable to generate the shift values, but we can use the packed
487 * vector float and a type-converting MOV.
488 */
489 dst_reg shift(this, glsl_type::uvec4_type);
490 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
491
492 dst_reg shifted(this, glsl_type::uvec4_type);
493 src0.swizzle = BRW_SWIZZLE_XXXX;
494 emit(SHR(shifted, src0, src_reg(shift)));
495
496 shifted.type = BRW_REGISTER_TYPE_UB;
497 dst_reg f(this, glsl_type::vec4_type);
498 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
499
500 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
501 }
502
503 void
504 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
505 {
506 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
507 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
508 * is not suitable to generate the shift values, but we can use the packed
509 * vector float and a type-converting MOV.
510 */
511 dst_reg shift(this, glsl_type::uvec4_type);
512 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
513
514 dst_reg shifted(this, glsl_type::uvec4_type);
515 src0.swizzle = BRW_SWIZZLE_XXXX;
516 emit(SHR(shifted, src0, src_reg(shift)));
517
518 shifted.type = BRW_REGISTER_TYPE_B;
519 dst_reg f(this, glsl_type::vec4_type);
520 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
521
522 dst_reg scaled(this, glsl_type::vec4_type);
523 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
524
525 dst_reg max(this, glsl_type::vec4_type);
526 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
527 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
528 }
529
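/**
 * packUnorm4x8(): saturate each component to [0, 1], scale by 255, round
 * to the nearest even integer, convert to unsigned, and pack the four
 * bytes into a single dword.
 */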
530 void
531 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
532 {
533 dst_reg saturated(this, glsl_type::vec4_type);
534 vec4_instruction *inst = emit(MOV(saturated, src0));
535 inst->saturate = true;
536
537 dst_reg scaled(this, glsl_type::vec4_type);
538 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
539
540 dst_reg rounded(this, glsl_type::vec4_type);
541 emit(RNDE(rounded, src_reg(scaled)));
542
543 dst_reg u(this, glsl_type::uvec4_type);
544 emit(MOV(u, src_reg(rounded)));
545
546 src_reg bytes(u);
547 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
548 }
549
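/**
 * packSnorm4x8(): clamp each component to [-1, 1] with a min/max pair,
 * scale by 127, round to the nearest even integer, convert to signed, and
 * pack the four bytes into a single dword.
 */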
550 void
551 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
552 {
553 dst_reg max(this, glsl_type::vec4_type);
554 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
555
556 dst_reg min(this, glsl_type::vec4_type);
557 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
558
559 dst_reg scaled(this, glsl_type::vec4_type);
560 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
561
562 dst_reg rounded(this, glsl_type::vec4_type);
563 emit(RNDE(rounded, src_reg(scaled)));
564
565 dst_reg i(this, glsl_type::ivec4_type);
566 emit(MOV(i, src_reg(rounded)));
567
568 src_reg bytes(i);
569 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
570 }
571
572 /*
573 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
574 * false) elements needed to pack a type.
575 */
576 static int
577 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 case GLSL_TYPE_DOUBLE:
588 case GLSL_TYPE_UINT64:
589 case GLSL_TYPE_INT64:
590 if (type->is_matrix()) {
591 const glsl_type *col_type = type->column_type();
592 unsigned col_slots =
593 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
594 return type->matrix_columns * col_slots;
595 } else {
596 /* Regardless of the size of the vector, it gets a vec4. This is bad
597 * packing for things like floats, but otherwise arrays become a
598 * mess. Hopefully a later pass over the code can pack scalars
599 * down if appropriate.
600 */
601 return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
602 }
603 case GLSL_TYPE_ARRAY:
604 assert(type->length > 0);
605 return type_size_xvec4(type->fields.array, as_vec4) * type->length;
606 case GLSL_TYPE_STRUCT:
607 size = 0;
608 for (i = 0; i < type->length; i++) {
609 size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
610 }
611 return size;
612 case GLSL_TYPE_SUBROUTINE:
613 return 1;
614
615 case GLSL_TYPE_SAMPLER:
616 /* Samplers take up no register space, since they're baked in at
617 * link time.
618 */
619 return 0;
620 case GLSL_TYPE_ATOMIC_UINT:
621 return 0;
622 case GLSL_TYPE_IMAGE:
623 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
624 case GLSL_TYPE_VOID:
625 case GLSL_TYPE_ERROR:
626 case GLSL_TYPE_INTERFACE:
627 case GLSL_TYPE_FUNCTION:
628 unreachable("not reached");
629 }
630
631 return 0;
632 }
633
634 /**
635 * Returns the minimum number of vec4 elements needed to pack a type.
636 *
637 * For simple types, it will return 1 (a single vec4); for matrices, the
638 * number of columns; for array and struct, the sum of the vec4_size of
639 * each of its elements; and for sampler and atomic, zero.
640 *
641 * This method is useful to calculate how much register space is needed to
642 * store a particular type.
643 */
644 extern "C" int
645 type_size_vec4(const struct glsl_type *type)
646 {
647 return type_size_xvec4(type, true);
648 }
649
650 /**
651 * Returns the minimum number of dvec4 elements needed to pack a type.
652 *
653 * For simple types, it will return 1 (a single dvec4); for matrices, the
654 * number of columns; for array and struct, the sum of the dvec4_size of
655 * each of its elements; and for sampler and atomic, zero.
656 *
657 * This method is useful to calculate how much register space is needed to
658 * store a particular type.
659 *
660 * Measuring double-precision vertex inputs as dvec4 is required because
661 * ARB_vertex_attrib_64bit states that these use the same number of locations
662 * as the single-precision version. That is, two consecutive dvec4s would be
663 * located in location "x" and location "x+1", not "x+2".
664 *
665 * In order to map vec4/dvec4 vertex inputs in the proper ATTRs,
666 * remap_vs_attrs() will take into account both the location and whether the
667 * type fits in one or two vec4 slots.
668 */
669 extern "C" int
670 type_size_dvec4(const struct glsl_type *type)
671 {
672 return type_size_xvec4(type, false);
673 }
674
675 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
676 {
677 init();
678
679 this->file = VGRF;
680 this->nr = v->alloc.allocate(type_size_vec4(type));
681
682 if (type->is_array() || type->is_record()) {
683 this->swizzle = BRW_SWIZZLE_NOOP;
684 } else {
685 this->swizzle = brw_swizzle_for_size(type->vector_elements);
686 }
687
688 this->type = brw_type_for_base_type(type);
689 }
690
691 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
692 {
693 assert(size > 0);
694
695 init();
696
697 this->file = VGRF;
698 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
699
700 this->swizzle = BRW_SWIZZLE_NOOP;
701
702 this->type = brw_type_for_base_type(type);
703 }
704
705 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
706 {
707 init();
708
709 this->file = VGRF;
710 this->nr = v->alloc.allocate(type_size_vec4(type));
711
712 if (type->is_array() || type->is_record()) {
713 this->writemask = WRITEMASK_XYZW;
714 } else {
715 this->writemask = (1 << type->vector_elements) - 1;
716 }
717
718 this->type = brw_type_for_base_type(type);
719 }
720
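/**
 * Emit a SEL with the given conditional mod, which is how min/max are
 * expressed on this hardware: BRW_CONDITIONAL_L gives min(src0, src1) and
 * BRW_CONDITIONAL_GE gives max(src0, src1).
 */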
721 vec4_instruction *
722 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
723 src_reg src0, src_reg src1)
724 {
725 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
726 inst->conditional_mod = conditionalmod;
727 return inst;
728 }
729
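/**
 * Emit a linear interpolation, mix(x, y, a) = x * (1 - a) + y * a.  Gen6+
 * can use the three-source LRP instruction directly (with the operands
 * reordered as the hardware expects); earlier parts open-code the multiply
 * and add sequence.
 */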
730 vec4_instruction *
731 vec4_visitor::emit_lrp(const dst_reg &dst,
732 const src_reg &x, const src_reg &y, const src_reg &a)
733 {
734 if (devinfo->gen >= 6) {
735 /* Note that the instruction's argument order is reversed from GLSL
736 * and the IR.
737 */
738 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
739 fix_3src_operand(x)));
740 } else {
741 /* Earlier generations don't support three source operations, so we
742 * need to emit x*(1-a) + y*a.
743 */
744 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
745 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
746 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
747 y_times_a.writemask = dst.writemask;
748 one_minus_a.writemask = dst.writemask;
749 x_times_one_minus_a.writemask = dst.writemask;
750
751 emit(MUL(y_times_a, y, a));
752 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
753 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
754 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
755 }
756 }
757
758 /**
759 * Emits the instructions needed to perform a pull constant load. before_block
760 * and before_inst can be NULL, in which case the instruction will be appended
761 * to the end of the instruction list.
762 */
763 void
764 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
765 src_reg surf_index,
766 src_reg offset_reg,
767 bblock_t *before_block,
768 vec4_instruction *before_inst)
769 {
770 assert((before_inst == NULL && before_block == NULL) ||
771 (before_inst && before_block));
772
773 vec4_instruction *pull;
774
775 if (devinfo->gen >= 9) {
776 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
777 src_reg header(this, glsl_type::uvec4_type, 2);
778
779 pull = new(mem_ctx)
780 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
781 dst_reg(header));
782
783 if (before_inst)
784 emit_before(before_block, before_inst, pull);
785 else
786 emit(pull);
787
788 dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
789 offset_reg.type);
790 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
791
792 if (before_inst)
793 emit_before(before_block, before_inst, pull);
794 else
795 emit(pull);
796
797 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
798 dst,
799 surf_index,
800 header);
801 pull->mlen = 2;
802 pull->header_size = 1;
803 } else if (devinfo->gen >= 7) {
804 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
805
806 grf_offset.type = offset_reg.type;
807
808 pull = MOV(grf_offset, offset_reg);
809
810 if (before_inst)
811 emit_before(before_block, before_inst, pull);
812 else
813 emit(pull);
814
815 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
816 dst,
817 surf_index,
818 src_reg(grf_offset));
819 pull->mlen = 1;
820 } else {
821 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
822 dst,
823 surf_index,
824 offset_reg);
825 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
826 pull->mlen = 1;
827 }
828
829 if (before_inst)
830 emit_before(before_block, before_inst, pull);
831 else
832 emit(pull);
833 }
834
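/**
 * Produce a value that is guaranteed to be uniform across channels by
 * finding the first live channel and broadcasting that channel's value of
 * \p src to every channel of the result.
 */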
835 src_reg
836 vec4_visitor::emit_uniformize(const src_reg &src)
837 {
838 const src_reg chan_index(this, glsl_type::uint_type);
839 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
840 src.type);
841
842 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
843 ->force_writemask_all = true;
844 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
845 ->force_writemask_all = true;
846
847 return src_reg(dst);
848 }
849
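/**
 * Fetch the MCS (multisample control surface) data for a texel with a
 * TXF_MCS message; the result is later passed as the 'mcs' argument of a
 * compressed multisample fetch in emit_texture().
 */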
850 src_reg
851 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
852 src_reg coordinate, src_reg surface)
853 {
854 vec4_instruction *inst =
855 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
856 dst_reg(this, glsl_type::uvec4_type));
857 inst->base_mrf = 2;
858 inst->src[1] = surface;
859 inst->src[2] = surface;
860
861 int param_base;
862
863 if (devinfo->gen >= 9) {
864 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
865 vec4_instruction *header_inst = new(mem_ctx)
866 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
867 dst_reg(MRF, inst->base_mrf));
868
869 emit(header_inst);
870
871 inst->mlen = 2;
872 inst->header_size = 1;
873 param_base = inst->base_mrf + 1;
874 } else {
875 inst->mlen = 1;
876 param_base = inst->base_mrf;
877 }
878
879 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
880 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
881 int zero_mask = 0xf & ~coord_mask;
882
883 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
884 coordinate));
885
886 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
887 brw_imm_d(0)));
888
889 emit(inst);
890 return src_reg(inst->dst);
891 }
892
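/* Sampler indices above 15 (or indices that aren't compile-time constants)
 * can't be encoded directly in the sampler message descriptor and need the
 * message-header path instead.  This only comes into play on Haswell and
 * later; older parts always return false here.
 */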
893 bool
894 vec4_visitor::is_high_sampler(src_reg sampler)
895 {
896 if (devinfo->gen < 8 && !devinfo->is_haswell)
897 return false;
898
899 return sampler.file != IMM || sampler.ud >= 16;
900 }
901
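/**
 * Emit a SIMD4x2 sampler message: pick the hardware opcode for the IR
 * texture operation, decide whether a message header is needed, load the
 * coordinate, shadow comparator, LOD/derivatives, sample index/MCS and
 * gather offset into consecutive MRFs, and apply the post-fixups some
 * operations need (Gen4-6 TXS layer clamping, Gen6 gather workarounds,
 * moving the level count for textureQueryLevels()).
 */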
902 void
903 vec4_visitor::emit_texture(ir_texture_opcode op,
904 dst_reg dest,
905 const glsl_type *dest_type,
906 src_reg coordinate,
907 int coord_components,
908 src_reg shadow_comparator,
909 src_reg lod, src_reg lod2,
910 src_reg sample_index,
911 uint32_t constant_offset,
912 src_reg offset_value,
913 src_reg mcs,
914 uint32_t surface,
915 src_reg surface_reg,
916 src_reg sampler_reg)
917 {
918 /* The sampler can only meaningfully compute LOD for fragment shader
919 * messages. For all other stages, we change the opcode to TXL and hardcode
920 * the LOD to 0.
921 *
922 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
923 * valid LOD argument.
924 */
925 if (op == ir_tex || op == ir_query_levels) {
926 assert(lod.file == BAD_FILE);
927 lod = brw_imm_f(0.0f);
928 }
929
930 enum opcode opcode;
931 switch (op) {
932 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
933 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
934 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
935 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
936 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
937 SHADER_OPCODE_TXF_CMS); break;
938 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
939 case ir_tg4: opcode = offset_value.file != BAD_FILE
940 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
941 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
942 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
943 case ir_txb:
944 unreachable("TXB is not valid for vertex shaders.");
945 case ir_lod:
946 unreachable("LOD is not valid for vertex shaders.");
947 case ir_samples_identical: {
948 /* There are some challenges implementing this for vec4, and it seems
949 * unlikely to be used anyway. For now, just always return false.
950 */
951 emit(MOV(dest, brw_imm_ud(0u)));
952 return;
953 }
954 default:
955 unreachable("Unrecognized tex op");
956 }
957
958 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
959
960 inst->offset = constant_offset;
961
962 /* The message header is necessary for:
963 * - Gen4 (always)
964 * - Gen9+ for selecting SIMD4x2
965 * - Texel offsets
966 * - Gather channel selection
967 * - Sampler indices too large to fit in a 4-bit value.
968 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
969 */
970 inst->header_size =
971 (devinfo->gen < 5 || devinfo->gen >= 9 ||
972 inst->offset != 0 || op == ir_tg4 ||
973 op == ir_texture_samples ||
974 is_high_sampler(sampler_reg)) ? 1 : 0;
975 inst->base_mrf = 2;
976 inst->mlen = inst->header_size;
977 inst->dst.writemask = WRITEMASK_XYZW;
978 inst->shadow_compare = shadow_comparator.file != BAD_FILE;
979
980 inst->src[1] = surface_reg;
981 inst->src[2] = sampler_reg;
982
983 /* MRF for the first parameter */
984 int param_base = inst->base_mrf + inst->header_size;
985
986 if (op == ir_txs || op == ir_query_levels) {
987 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
988 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
989 inst->mlen++;
990 } else if (op == ir_texture_samples) {
991 inst->dst.writemask = WRITEMASK_X;
992 } else {
993 /* Load the coordinate */
994 /* FINISHME: gl_clamp_mask and saturate */
995 int coord_mask = (1 << coord_components) - 1;
996 int zero_mask = 0xf & ~coord_mask;
997
998 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
999 coordinate));
1000 inst->mlen++;
1001
1002 if (zero_mask != 0) {
1003 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
1004 brw_imm_d(0)));
1005 }
1006 /* Load the shadow comparator */
1007 if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1008 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
1009 WRITEMASK_X),
1010 shadow_comparator));
1011 inst->mlen++;
1012 }
1013
1014 /* Load the LOD info */
1015 if (op == ir_tex || op == ir_txl) {
1016 int mrf, writemask;
1017 if (devinfo->gen >= 5) {
1018 mrf = param_base + 1;
1019 if (shadow_comparator.file != BAD_FILE) {
1020 writemask = WRITEMASK_Y;
1021 /* mlen already incremented */
1022 } else {
1023 writemask = WRITEMASK_X;
1024 inst->mlen++;
1025 }
1026 } else /* devinfo->gen == 4 */ {
1027 mrf = param_base;
1028 writemask = WRITEMASK_W;
1029 }
1030 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1031 } else if (op == ir_txf) {
1032 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1033 } else if (op == ir_txf_ms) {
1034 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1035 sample_index));
1036 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1037 /* MCS data is stored in the first two channels of ‘mcs’, but we
1038 * need to get it into the .y and .z channels of the second vec4
1039 * of params.
1040 */
1041 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1042 emit(MOV(dst_reg(MRF, param_base + 1,
1043 glsl_type::uint_type, WRITEMASK_YZ),
1044 mcs));
1045 } else if (devinfo->gen >= 7) {
1046 /* MCS data is in the first channel of `mcs`, but we need to get it into
1047 * the .y channel of the second vec4 of params, so replicate .x across
1048 * the whole vec4 and then mask off everything except .y
1049 */
1050 mcs.swizzle = BRW_SWIZZLE_XXXX;
1051 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1052 mcs));
1053 }
1054 inst->mlen++;
1055 } else if (op == ir_txd) {
1056 const brw_reg_type type = lod.type;
1057
1058 if (devinfo->gen >= 5) {
1059 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1060 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1061 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1062 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1063 inst->mlen++;
1064
1065 if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
1066 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1067 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1068 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1069 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1070 inst->mlen++;
1071
1072 if (shadow_comparator.file != BAD_FILE) {
1073 emit(MOV(dst_reg(MRF, param_base + 2,
1074 shadow_comparator.type, WRITEMASK_Z),
1075 shadow_comparator));
1076 }
1077 }
1078 } else /* devinfo->gen == 4 */ {
1079 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1080 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1081 inst->mlen += 2;
1082 }
1083 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1084 if (shadow_comparator.file != BAD_FILE) {
1085 emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
1086 shadow_comparator));
1087 }
1088
1089 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1090 offset_value));
1091 inst->mlen++;
1092 }
1093 }
1094
1095 emit(inst);
1096
1097 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1098 * spec requires layers.
1099 */
1100 if (op == ir_txs && devinfo->gen < 7) {
1101 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1102 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1103 src_reg(inst->dst), brw_imm_d(1));
1104 }
1105
1106 if (devinfo->gen == 6 && op == ir_tg4) {
1107 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1108 }
1109
1110 if (op == ir_query_levels) {
1111 /* # levels is in .w */
1112 src_reg swizzled(dest);
1113 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1114 SWIZZLE_W, SWIZZLE_W);
1115 emit(MOV(dest, swizzled));
1116 }
1117 }
1118
1119 /**
1120 * Apply workarounds for Gen6 gather with UINT/SINT
1121 */
1122 void
1123 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1124 {
1125 if (!wa)
1126 return;
1127
1128 int width = (wa & WA_8BIT) ? 8 : 16;
1129 dst_reg dst_f = dst;
1130 dst_f.type = BRW_REGISTER_TYPE_F;
1131
1132 /* Convert from UNORM to UINT */
1133 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1134 emit(MOV(dst, src_reg(dst_f)));
1135
1136 if (wa & WA_SIGN) {
1137 /* Reinterpret the UINT value as a signed INT value by
1138 * shifting the sign bit into place, then shifting back
1139 * preserving sign.
1140 */
1141 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1142 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1143 }
1144 }
1145
1146 void
1147 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1148 {
1149 unreachable("not reached");
1150 }
1151
1152 void
1153 vec4_visitor::gs_end_primitive()
1154 {
1155 unreachable("not reached");
1156 }
1157
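/**
 * Compute the NDC slot used by the pre-Gen6 fixed function: 1/W via RCP
 * into the .w channel, then (x, y, z) * 1/W into .xyz.  Skipped entirely
 * if the shader never wrote gl_Position.
 */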
1158 void
1159 vec4_visitor::emit_ndc_computation()
1160 {
1161 if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1162 return;
1163
1164 /* Get the position */
1165 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1166
1167 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1168 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1169 output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1170 output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1171
1172 current_annotation = "NDC";
1173 dst_reg ndc_w = ndc;
1174 ndc_w.writemask = WRITEMASK_W;
1175 src_reg pos_w = pos;
1176 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1177 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1178
1179 dst_reg ndc_xyz = ndc;
1180 ndc_xyz.writemask = WRITEMASK_XYZ;
1181
1182 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1183 }
1184
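/**
 * Fill the VUE header dword holding point size and friends.  On Gen4-5
 * this packs point size and the user clip flags (plus the negative-RHW
 * clipping workaround) into a single header word; on Gen6+ it writes point
 * size, layer and viewport index into the .w/.y/.z channels of the slot.
 * Outputs the shader never wrote are left at the zero the slot is
 * initialized with.
 */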
1185 void
1186 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1187 {
1188 if (devinfo->gen < 6 &&
1189 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1190 output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1191 devinfo->has_negative_rhw_bug)) {
1192 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1193 dst_reg header1_w = header1;
1194 header1_w.writemask = WRITEMASK_W;
1195
1196 emit(MOV(header1, brw_imm_ud(0u)));
1197
1198 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1199 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1200
1201 current_annotation = "Point size";
1202 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1203 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1204 }
1205
1206 if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1207 current_annotation = "Clipping flags";
1208 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1209 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1210
1211 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1212 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1213 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1214
1215 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1216 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1217 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1218 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1219 }
1220
1221 /* i965 clipping workaround:
1222 * 1) Test for -ve rhw
1223 * 2) If set,
1224 * set ndc = (0,0,0,0)
1225 * set ucp[6] = 1
1226 *
1227 * Later, clipping will detect ucp[6] and ensure the primitive is
1228 * clipped against all fixed planes.
1229 */
1230 if (devinfo->has_negative_rhw_bug &&
1231 output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1232 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1233 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1234 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1235 vec4_instruction *inst;
1236 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1237 inst->predicate = BRW_PREDICATE_NORMAL;
1238 output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1239 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1240 inst->predicate = BRW_PREDICATE_NORMAL;
1241 }
1242
1243 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1244 } else if (devinfo->gen < 6) {
1245 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1246 } else {
1247 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1248 if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
1249 dst_reg reg_w = reg;
1250 reg_w.writemask = WRITEMASK_W;
1251 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1252 reg_as_src.type = reg_w.type;
1253 reg_as_src.swizzle = brw_swizzle_for_size(1);
1254 emit(MOV(reg_w, reg_as_src));
1255 }
1256 if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
1257 dst_reg reg_y = reg;
1258 reg_y.writemask = WRITEMASK_Y;
1259 reg_y.type = BRW_REGISTER_TYPE_D;
1260 output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1261 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1262 }
1263 if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
1264 dst_reg reg_z = reg;
1265 reg_z.writemask = WRITEMASK_Z;
1266 reg_z.type = BRW_REGISTER_TYPE_D;
1267 output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1268 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1269 }
1270 }
1271 }
1272
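/**
 * Copy one packed component group of a generic varying into the URB write
 * payload register \p reg, using the writemask that matches how many
 * components were packed at this offset.  Returns NULL if nothing was ever
 * written to that part of the output.
 */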
1273 vec4_instruction *
1274 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1275 {
1276 assert(varying < VARYING_SLOT_MAX);
1277
1278 unsigned num_comps = output_num_components[varying][component];
1279 if (num_comps == 0)
1280 return NULL;
1281
1282 assert(output_reg[varying][component].type == reg.type);
1283 current_annotation = output_reg_annotation[varying];
1284 if (output_reg[varying][component].file != BAD_FILE) {
1285 src_reg src = src_reg(output_reg[varying][component]);
1286 src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1287 reg.writemask =
1288 brw_writemask_for_component_packing(num_comps, component);
1289 return emit(MOV(reg, src));
1290 }
1291 return NULL;
1292 }
1293
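/**
 * Fill one slot of the VUE being built in the MRFs: PSIZ, NDC, POS and the
 * edge flag get dedicated handling, padding slots are skipped, and every
 * other varying goes through emit_generic_urb_slot() per component group.
 */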
1294 void
1295 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1296 {
1297 reg.type = BRW_REGISTER_TYPE_F;
1298 output_reg[varying][0].type = reg.type;
1299
1300 switch (varying) {
1301 case VARYING_SLOT_PSIZ:
1302 {
1303 /* PSIZ is always in slot 0, and is coupled with other flags. */
1304 current_annotation = "indices, point width, clip flags";
1305 emit_psiz_and_flags(reg);
1306 break;
1307 }
1308 case BRW_VARYING_SLOT_NDC:
1309 current_annotation = "NDC";
1310 if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1311 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1312 break;
1313 case VARYING_SLOT_POS:
1314 current_annotation = "gl_Position";
1315 if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1316 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1317 break;
1318 case VARYING_SLOT_EDGE: {
1319 /* This is present when doing unfilled polygons. We're supposed to copy
1320 * the edge flag from the user-provided vertex array
1321 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1322 * of that attribute (starts as 1.0f). This is then used in clipping to
1323 * determine which edges should be drawn as wireframe.
1324 */
1325 current_annotation = "edge flag";
1326 int edge_attr = _mesa_bitcount_64(nir->info.inputs_read &
1327 BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG));
1328 emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr,
1329 glsl_type::float_type, WRITEMASK_XYZW))));
1330 break;
1331 }
1332 case BRW_VARYING_SLOT_PAD:
1333 /* No need to write to this slot */
1334 break;
1335 default:
1336 for (int i = 0; i < 4; i++) {
1337 emit_generic_urb_slot(reg, varying, i);
1338 }
1339 break;
1340 }
1341 }
1342
1343 static int
1344 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
1345 {
1346 if (devinfo->gen >= 6) {
1347 /* URB data written (does not include the message header reg) must
1348 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1349 * section 5.4.3.2.2: URB_INTERLEAVED.
1350 *
1351 * URB entries are allocated on a multiple of 1024 bits, so an
1352 * extra 128 bits written here to make the end align to 256 is
1353 * no problem.
1354 */
1355 if ((mlen % 2) != 1)
1356 mlen++;
1357 }
1358
1359 return mlen;
1360 }
1361
1362
1363 /**
1364 * Generates the VUE payload plus the necessary URB write instructions to
1365 * output it.
1366 *
1367 * The VUE layout is documented in Volume 2a.
1368 */
1369 void
1370 vec4_visitor::emit_vertex()
1371 {
1372 /* MRF 0 is reserved for the debugger, so start with message header
1373 * in MRF 1.
1374 */
1375 int base_mrf = 1;
1376 int mrf = base_mrf;
1377 /* In the process of generating our URB write message contents, we
1378 * may need to unspill a register or load from an array. Those
1379 * reads would use MRFs 14-15.
1380 */
1381 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1382
1383 /* The following assertion verifies that max_usable_mrf causes an
1384 * even-numbered amount of URB write data, which will meet gen6's
1385 * requirements for length alignment.
1386 */
1387 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1388
1389 /* First mrf is the g0-based message header containing URB handles and
1390 * such.
1391 */
1392 emit_urb_write_header(mrf++);
1393
1394 if (devinfo->gen < 6) {
1395 emit_ndc_computation();
1396 }
1397
1398 /* We may need to split this up into several URB writes, so do them in a
1399 * loop.
1400 */
1401 int slot = 0;
1402 bool complete = false;
1403 do {
1404 /* URB offset is in URB row increments, and each of our MRFs is half of
1405 * one of those, since we're doing interleaved writes.
1406 */
1407 int offset = slot / 2;
1408
1409 mrf = base_mrf + 1;
1410 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1411 emit_urb_slot(dst_reg(MRF, mrf++),
1412 prog_data->vue_map.slot_to_varying[slot]);
1413
1414 /* If this was max_usable_mrf, we can't fit anything more into this
1415 * URB WRITE. Same thing if we reached the maximum length available.
1416 */
1417 if (mrf > max_usable_mrf ||
1418 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1419 slot++;
1420 break;
1421 }
1422 }
1423
1424 complete = slot >= prog_data->vue_map.num_slots;
1425 current_annotation = "URB write";
1426 vec4_instruction *inst = emit_urb_write_opcode(complete);
1427 inst->base_mrf = base_mrf;
1428 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1429 inst->offset += offset;
1430 } while(!complete);
1431 }
1432
1433
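/**
 * Compute the per-message scratch offset for a spill access: scale the
 * vec4 offset by 2 because scratch data is stored interleaved like vertex
 * data (and by 16 more on Gen4-5, where the message header wants byte
 * offsets), folding in the indirect reladdr term when there is one.
 */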
1434 src_reg
1435 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1436 src_reg *reladdr, int reg_offset)
1437 {
1438 /* Because we store the values to scratch interleaved like our
1439 * vertex data, we need to scale the vec4 index by 2.
1440 */
1441 int message_header_scale = 2;
1442
1443 /* Pre-gen6, the message header uses byte offsets instead of vec4
1444 * (16-byte) offset units.
1445 */
1446 if (devinfo->gen < 6)
1447 message_header_scale *= 16;
1448
1449 if (reladdr) {
1450 /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1451 * to multiply the reladdr by 2. Notice that the reg_offset part
1452 * is in units of 16 bytes and is used to select the low/high 16-byte
1453 * chunk of a full dvec4, so we don't want to multiply that part.
1454 */
1455 src_reg index = src_reg(this, glsl_type::int_type);
1456 if (type_sz(inst->dst.type) < 8) {
1457 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1458 brw_imm_d(reg_offset)));
1459 emit_before(block, inst, MUL(dst_reg(index), index,
1460 brw_imm_d(message_header_scale)));
1461 } else {
1462 emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1463 brw_imm_d(message_header_scale * 2)));
1464 emit_before(block, inst, ADD(dst_reg(index), index,
1465 brw_imm_d(reg_offset * message_header_scale)));
1466 }
1467 return index;
1468 } else {
1469 return brw_imm_d(reg_offset * message_header_scale);
1470 }
1471 }
1472
1473 /**
1474 * Emits an instruction before @inst to load the value named by @orig_src
1475 * from scratch space at @base_offset to @temp.
1476 *
1477 * @base_offset is measured in 32-byte units (the size of a register).
1478 */
1479 void
1480 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1481 dst_reg temp, src_reg orig_src,
1482 int base_offset)
1483 {
1484 assert(orig_src.offset % REG_SIZE == 0);
1485 int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1486 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1487 reg_offset);
1488
1489 if (type_sz(orig_src.type) < 8) {
1490 emit_before(block, inst, SCRATCH_READ(temp, index));
1491 } else {
1492 dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1493 dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1494 emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1495 index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1496 vec4_instruction *last_read =
1497 SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1498 emit_before(block, inst, last_read);
1499 shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1500 }
1501 }
1502
1503 /**
1504 * Emits an instruction after @inst to store the value to be written
1505 * to @orig_dst to scratch space at @base_offset, from @temp.
1506 *
1507 * @base_offset is measured in 32-byte units (the size of a register).
1508 */
1509 void
1510 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1511 int base_offset)
1512 {
1513 assert(inst->dst.offset % REG_SIZE == 0);
1514 int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1515 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1516 reg_offset);
1517
1518 /* Create a temporary register to store *inst's result in.
1519 *
1520 * We have to be careful in MOVing from our temporary result register in
1521 * the scratch write. If we swizzle from channels of the temporary that
1522 * weren't initialized, it will confuse live interval analysis, which will
1523 * make spilling fail to make progress.
1524 */
1525 bool is_64bit = type_sz(inst->dst.type) == 8;
1526 const glsl_type *alloc_type =
1527 is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1528 const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1529 inst->dst.type),
1530 brw_swizzle_for_mask(inst->dst.writemask));
1531
1532 if (!is_64bit) {
1533 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1534 inst->dst.writemask));
1535 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1536 if (inst->opcode != BRW_OPCODE_SEL)
1537 write->predicate = inst->predicate;
1538 write->ir = inst->ir;
1539 write->annotation = inst->annotation;
1540 inst->insert_after(block, write);
1541 } else {
1542 dst_reg shuffled = dst_reg(this, alloc_type);
1543 vec4_instruction *last =
1544 shuffle_64bit_data(shuffled, temp, true, block, inst);
1545 src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1546
1547 uint8_t mask = 0;
1548 if (inst->dst.writemask & WRITEMASK_X)
1549 mask |= WRITEMASK_XY;
1550 if (inst->dst.writemask & WRITEMASK_Y)
1551 mask |= WRITEMASK_ZW;
1552 if (mask) {
1553 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1554
1555 vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1556 if (inst->opcode != BRW_OPCODE_SEL)
1557 write->predicate = inst->predicate;
1558 write->ir = inst->ir;
1559 write->annotation = inst->annotation;
1560 last->insert_after(block, write);
1561 }
1562
1563 mask = 0;
1564 if (inst->dst.writemask & WRITEMASK_Z)
1565 mask |= WRITEMASK_XY;
1566 if (inst->dst.writemask & WRITEMASK_W)
1567 mask |= WRITEMASK_ZW;
1568 if (mask) {
1569 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1570
1571 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1572 reg_offset + 1);
1573 vec4_instruction *write =
1574 SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1575 if (inst->opcode != BRW_OPCODE_SEL)
1576 write->predicate = inst->predicate;
1577 write->ir = inst->ir;
1578 write->annotation = inst->annotation;
1579 last->insert_after(block, write);
1580 }
1581 }
1582
1583 inst->dst.file = temp.file;
1584 inst->dst.nr = temp.nr;
1585 inst->dst.offset %= REG_SIZE;
1586 inst->dst.reladdr = NULL;
1587 }
1588
1589 /**
1590 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1591 * adds the scratch read(s) before \p inst. The function also checks for
1592 * recursive reladdr scratch accesses, issuing the corresponding scratch
1593 * loads and rewriting reladdr references accordingly.
1594 *
1595 * \return \p src if it did not require a scratch load, otherwise, the
1596 * register holding the result of the scratch load that the caller should
1597 * use to rewrite src.
1598 */
1599 src_reg
1600 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1601 vec4_instruction *inst, src_reg src)
1602 {
1603 /* Resolve recursive reladdr scratch access by calling ourselves
1604 * with src.reladdr
1605 */
1606 if (src.reladdr)
1607 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1608 *src.reladdr);
1609
1610 /* Now handle scratch access on src */
1611 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1612 dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1613 glsl_type::dvec4_type : glsl_type::vec4_type);
1614 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1615 src.nr = temp.nr;
1616 src.offset %= REG_SIZE;
1617 src.reladdr = NULL;
1618 }
1619
1620 return src;
1621 }
1622
1623 /**
1624 * We can't generally support array access in GRF space, because a
1625 * single instruction's destination can only span 2 contiguous
1626 * registers. So, we send all GRF arrays that get variable index
1627 * access to scratch space.
1628 */
1629 void
1630 vec4_visitor::move_grf_array_access_to_scratch()
1631 {
1632 int scratch_loc[this->alloc.count];
1633 memset(scratch_loc, -1, sizeof(scratch_loc));
1634
1635 /* First, calculate the set of virtual GRFs that need to be punted
1636 * to scratch due to having any array access on them, and where in
1637 * scratch.
1638 */
1639 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1640 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1641 if (scratch_loc[inst->dst.nr] == -1) {
1642 scratch_loc[inst->dst.nr] = last_scratch;
1643 last_scratch += this->alloc.sizes[inst->dst.nr];
1644 }
1645
1646 for (src_reg *iter = inst->dst.reladdr;
1647 iter->reladdr;
1648 iter = iter->reladdr) {
1649 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1650 scratch_loc[iter->nr] = last_scratch;
1651 last_scratch += this->alloc.sizes[iter->nr];
1652 }
1653 }
1654 }
1655
1656 for (int i = 0 ; i < 3; i++) {
1657 for (src_reg *iter = &inst->src[i];
1658 iter->reladdr;
1659 iter = iter->reladdr) {
1660 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1661 scratch_loc[iter->nr] = last_scratch;
1662 last_scratch += this->alloc.sizes[iter->nr];
1663 }
1664 }
1665 }
1666 }
1667
1668 /* Now, for anything that will be accessed through scratch, rewrite
1669 * it to load/store. Note that this is a _safe list walk, because
1670 * we may generate a new scratch_write instruction after the one
1671 * we're processing.
1672 */
1673 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1674 /* Set up the annotation tracking for new generated instructions. */
1675 base_ir = inst->ir;
1676 current_annotation = inst->annotation;
1677
1678 /* First handle scratch access on the dst. Notice we have to handle
1679 * the case where the dst's reladdr also points to scratch space.
1680 */
1681 if (inst->dst.reladdr)
1682 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1683 *inst->dst.reladdr);
1684
1685 /* Now that we have handled any (possibly recursive) reladdr scratch
1686 * accesses for dst we can safely do the scratch write for dst itself
1687 */
1688 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1689 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1690
1691 /* Now handle scratch access on any src. In this case, since inst->src[i]
1692 * already is a src_reg, we can just call emit_resolve_reladdr with
1693 * inst->src[i] and it will take care of handling scratch loads for
1694 * both src and src.reladdr (recursively).
1695 */
1696 for (int i = 0 ; i < 3; i++) {
1697 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1698 inst->src[i]);
1699 }
1700 }
1701 }
1702
1703 /**
1704 * Emits an instruction before @inst to load the value named by @orig_src
1705 * from the pull constant buffer (surface) at @base_offset to @temp.
1706 */
1707 void
1708 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1709 dst_reg temp, src_reg orig_src,
1710 int base_offset, src_reg indirect)
1711 {
1712 assert(orig_src.offset % 16 == 0);
1713 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1714
1715 /* For 64-bit loads we need to emit two 32-bit load messages, and we also
1716 * need to shuffle the 32-bit data result into proper 64-bit data. To do
1717 * that we emit the 32-bit loads into a temporary and we shuffle the result
1718 * into the original destination.
1719 */
1720 dst_reg orig_temp = temp;
1721 bool is_64bit = type_sz(orig_src.type) == 8;
1722 if (is_64bit) {
1723 assert(type_sz(temp.type) == 8);
1724 dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1725 temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1726 }
1727
1728 src_reg src = orig_src;
1729 for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
1730 int reg_offset = base_offset + src.offset / 16;
1731
1732 src_reg offset;
1733 if (indirect.file != BAD_FILE) {
1734 offset = src_reg(this, glsl_type::uint_type);
1735 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1736 brw_imm_ud(reg_offset * 16)));
1737 } else if (devinfo->gen >= 8) {
1738 /* Store the offset in a GRF so we can send-from-GRF. */
1739 offset = src_reg(this, glsl_type::uint_type);
1740 emit_before(block, inst, MOV(dst_reg(offset),
1741 brw_imm_ud(reg_offset * 16)));
1742 } else {
1743 offset = brw_imm_d(reg_offset * 16);
1744 }
1745
1746 emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1747 brw_imm_ud(index),
1748 offset,
1749 block, inst);
1750
1751 src = byte_offset(src, 16);
1752 }
1753
1754 brw_mark_surface_used(&prog_data->base, index);
1755
1756 if (is_64bit) {
1757 temp = retype(temp, BRW_REGISTER_TYPE_DF);
1758 shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
1759 }
1760 }
1761
1762 /**
1763 * Implements array access of uniforms by inserting a
1764 * PULL_CONSTANT_LOAD instruction.
1765 *
1766 * Unlike temporary GRF array access (where we don't support it due to
1767 * the difficulty of doing relative addressing on instruction
1768 * destinations), we could potentially do array access of uniforms
1769 * that were loaded in GRF space as push constants. In real-world
1770 * usage we've seen, though, the arrays being used are always larger
1771 * than we could load as push constants, so just always move all
1772 * uniform array access out to a pull constant buffer.
1773 */
1774 void
1775 vec4_visitor::move_uniform_array_access_to_pull_constants()
1776 {
1777 /* The Vulkan driver doesn't support pull constants other than UBOs, so
1778 * everything has to be pushed regardless.
1779 */
1780 if (stage_prog_data->pull_param == NULL) {
1781 split_uniform_registers();
1782 return;
1783 }
1784
1785 int pull_constant_loc[this->uniforms];
1786 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1787
1788 /* First, walk through the instructions and determine which things need to
1789 * be pulled. We mark something as needing to be pulled by setting
1790 * pull_constant_loc to 0.
1791 */
1792 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1793 /* We only care about MOV_INDIRECT of a uniform */
1794 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1795 inst->src[0].file != UNIFORM)
1796 continue;
1797
1798 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1799
1800 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1801 pull_constant_loc[uniform_nr + j] = 0;
1802 }
1803
1804 /* Next, we walk the list of uniforms and assign real pull constant
1805 * locations and set their corresponding entries in pull_param.
1806 */
1807 for (int j = 0; j < this->uniforms; j++) {
1808 if (pull_constant_loc[j] < 0)
1809 continue;
1810
1811 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1812
1813 for (int i = 0; i < 4; i++) {
1814 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1815 = stage_prog_data->param[j * 4 + i];
1816 }
1817 }
1818
1819 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1820 * instructions to actual uniform pulls.
1821 */
1822 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1823 /* We only care about MOV_INDIRECT of a uniform */
1824 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1825 inst->src[0].file != UNIFORM)
1826 continue;
1827
1828 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1829
1830 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1831
1832 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1833 pull_constant_loc[uniform_nr], inst->src[1]);
1834 inst->remove(block);
1835 }
1836
1837 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1838 * no need to track them as larger-than-vec4 objects. This will be
1839 * relied on in cutting out unused uniform vectors from push
1840 * constants.
1841 */
1842 split_uniform_registers();
1843 }
1844
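/* If an unsigned source carries the negate modifier, copy it through a MOV
 * into a temporary so the negation is materialized before the consuming
 * instruction; the IF and CMP builders above call this on their sources.
 */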
1845 void
1846 vec4_visitor::resolve_ud_negate(src_reg *reg)
1847 {
1848 if (reg->type != BRW_REGISTER_TYPE_UD ||
1849 !reg->negate)
1850 return;
1851
1852 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1853 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1854 *reg = temp;
1855 }
1856
1857 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1858 void *log_data,
1859 const struct brw_sampler_prog_key_data *key_tex,
1860 struct brw_vue_prog_data *prog_data,
1861 const nir_shader *shader,
1862 void *mem_ctx,
1863 bool no_spills,
1864 int shader_time_index)
1865 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1866 key_tex(key_tex),
1867 prog_data(prog_data),
1868 fail_msg(NULL),
1869 first_non_payload_grf(0),
1870 need_all_constants_in_pull_buffer(false),
1871 no_spills(no_spills),
1872 shader_time_index(shader_time_index),
1873 last_scratch(0)
1874 {
1875 this->failed = false;
1876
1877 this->base_ir = NULL;
1878 this->current_annotation = NULL;
1879 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1880
1881 memset(this->output_num_components, 0, sizeof(this->output_num_components));
1882
1883 this->virtual_grf_start = NULL;
1884 this->virtual_grf_end = NULL;
1885 this->live_intervals = NULL;
1886
1887 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1888
1889 this->uniforms = 0;
1890 }
1891
1892 vec4_visitor::~vec4_visitor()
1893 {
1894 }
1895
1896
1897 void
1898 vec4_visitor::fail(const char *format, ...)
1899 {
1900 va_list va;
1901 char *msg;
1902
1903 if (failed)
1904 return;
1905
1906 failed = true;
1907
1908 va_start(va, format);
1909 msg = ralloc_vasprintf(mem_ctx, format, va);
1910 va_end(va);
1911 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1912
1913 this->fail_msg = msg;
1914
1915 if (debug_enabled) {
1916 fprintf(stderr, "%s", msg);
1917 }
1918 }
1919
1920 } /* namespace brw */