intel/compiler: Move the destructor from vec4_visitor to backend_shader
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27
28 namespace brw {
29
30 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
31 const src_reg &src0, const src_reg &src1,
32 const src_reg &src2)
33 {
34 this->opcode = opcode;
35 this->dst = dst;
36 this->src[0] = src0;
37 this->src[1] = src1;
38 this->src[2] = src2;
39 this->saturate = false;
40 this->force_writemask_all = false;
41 this->no_dd_clear = false;
42 this->no_dd_check = false;
43 this->writes_accumulator = false;
44 this->conditional_mod = BRW_CONDITIONAL_NONE;
45 this->predicate = BRW_PREDICATE_NONE;
46 this->predicate_inverse = false;
47 this->target = 0;
48 this->shadow_compare = false;
49 this->eot = false;
50 this->ir = NULL;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_size = 0;
53 this->flag_subreg = 0;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->exec_size = 8;
58 this->group = 0;
59 this->size_written = (dst.file == BAD_FILE ?
60 0 : this->exec_size * type_sz(dst.type));
61 this->annotation = NULL;
62 }
63
64 vec4_instruction *
65 vec4_visitor::emit(vec4_instruction *inst)
66 {
67 inst->ir = this->base_ir;
68 inst->annotation = this->current_annotation;
69
70 this->instructions.push_tail(inst);
71
72 return inst;
73 }
74
75 vec4_instruction *
76 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
77 vec4_instruction *new_inst)
78 {
79 new_inst->ir = inst->ir;
80 new_inst->annotation = inst->annotation;
81
82 inst->insert_before(block, new_inst);
83
84 return inst;
85 }
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
89 const src_reg &src1, const src_reg &src2)
90 {
91 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
92 }
93
94
95 vec4_instruction *
96 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
97 const src_reg &src1)
98 {
99 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
104 {
105 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
110 {
111 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
112 }
113
114 vec4_instruction *
115 vec4_visitor::emit(enum opcode opcode)
116 {
117 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
118 }
119
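/* The ALU1/ALU2/ALU2_ACC/ALU3 macros below stamp out one builder helper per
 * opcode (e.g. vec4_visitor::ADD()).  Note that these helpers only allocate
 * the vec4_instruction in mem_ctx; the caller still has to hand the result
 * to emit() or emit_before() to append it to the instruction stream.
 */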
120 #define ALU1(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
123 { \
124 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
125 }
126
127 #define ALU2(op) \
128 vec4_instruction * \
129 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
130 const src_reg &src1) \
131 { \
132 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
133 src0, src1); \
134 }
135
136 #define ALU2_ACC(op) \
137 vec4_instruction * \
138 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
139 const src_reg &src1) \
140 { \
141 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
142 BRW_OPCODE_##op, dst, src0, src1); \
143 inst->writes_accumulator = true; \
144 return inst; \
145 }
146
147 #define ALU3(op) \
148 vec4_instruction * \
149 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
150 const src_reg &src1, const src_reg &src2) \
151 { \
152 assert(devinfo->gen >= 6); \
153 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
154 src0, src1, src2); \
155 }
156
157 ALU1(NOT)
158 ALU1(MOV)
159 ALU1(FRC)
160 ALU1(RNDD)
161 ALU1(RNDE)
162 ALU1(RNDZ)
163 ALU1(F32TO16)
164 ALU1(F16TO32)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2_ACC(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(DP3)
172 ALU2(DP4)
173 ALU2(DPH)
174 ALU2(SHL)
175 ALU2(SHR)
176 ALU2(ASR)
177 ALU3(LRP)
178 ALU1(BFREV)
179 ALU3(BFE)
180 ALU2(BFI1)
181 ALU3(BFI2)
182 ALU1(FBH)
183 ALU1(FBL)
184 ALU1(CBIT)
185 ALU3(MAD)
186 ALU2_ACC(ADDC)
187 ALU2_ACC(SUBB)
188 ALU2(MAC)
189 ALU1(DIM)
190
191 /** Gen4 predicated IF. */
192 vec4_instruction *
193 vec4_visitor::IF(enum brw_predicate predicate)
194 {
195 vec4_instruction *inst;
196
197 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
198 inst->predicate = predicate;
199
200 return inst;
201 }
202
203 /** Gen6 IF with embedded comparison. */
204 vec4_instruction *
205 vec4_visitor::IF(src_reg src0, src_reg src1,
206 enum brw_conditional_mod condition)
207 {
208 assert(devinfo->gen == 6);
209
210 vec4_instruction *inst;
211
212 resolve_ud_negate(&src0);
213 resolve_ud_negate(&src1);
214
215 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
216 src0, src1);
217 inst->conditional_mod = condition;
218
219 return inst;
220 }
221
222 /**
223 * CMP: Sets the low bit of the destination channels with the result
224 * of the comparison, while the upper bits are undefined, and updates
225 * the flag register with the packed 16 bits of the result.
226 */
227 vec4_instruction *
228 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
229 enum brw_conditional_mod condition)
230 {
231 vec4_instruction *inst;
232
233 /* Take the instruction:
234 *
235 * CMP null<d> src0<f> src1<f>
236 *
237 * Original gen4 does type conversion to the destination type before
238 * comparison, producing garbage results for floating point comparisons.
239 *
240 * The destination type doesn't matter on newer generations, so we set the
241 * type to match src0 so we can compact the instruction.
242 */
243 dst.type = src0.type;
244
245 resolve_ud_negate(&src0);
246 resolve_ud_negate(&src1);
247
248 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
249 inst->conditional_mod = condition;
250
251 return inst;
252 }
253
254 vec4_instruction *
255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
256 {
257 vec4_instruction *inst;
258
259 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
260 dst, index);
261 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
262 inst->mlen = 2;
263
264 return inst;
265 }
266
267 vec4_instruction *
268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
269 const src_reg &index)
270 {
271 vec4_instruction *inst;
272
273 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
274 dst, src, index);
275 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
276 inst->mlen = 3;
277
278 return inst;
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(const src_reg &src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
304 return src_reg(expanded);
305 }
306
307 src_reg
308 vec4_visitor::resolve_source_modifiers(const src_reg &src)
309 {
310 if (!src.abs && !src.negate)
311 return src;
312
313 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
314 resolved.type = src.type;
315 emit(MOV(resolved, src));
316
317 return src_reg(resolved);
318 }
319
320 src_reg
321 vec4_visitor::fix_math_operand(const src_reg &src)
322 {
323 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
324 return src;
325
326 /* The gen6 math instruction ignores the source modifiers --
327 * swizzle, abs, negate, and at least some parts of the register
328 * region description.
329 *
330 * Rather than trying to enumerate all these cases, *always* expand the
331 * operand to a temp GRF for gen6.
332 *
333 * For gen7, keep the operand as-is, except if immediate, which gen7 still
334 * can't use.
335 */
336
337 if (devinfo->gen == 7 && src.file != IMM)
338 return src;
339
340 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
341 expanded.type = src.type;
342 emit(MOV(expanded, src));
343 return src_reg(expanded);
344 }
345
346 vec4_instruction *
347 vec4_visitor::emit_math(enum opcode opcode,
348 const dst_reg &dst,
349 const src_reg &src0, const src_reg &src1)
350 {
351 vec4_instruction *math =
352 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
353
354 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
355 /* MATH on Gen6 must be align1, so we can't do writemasks. */
356 math->dst = dst_reg(this, glsl_type::vec4_type);
357 math->dst.type = dst.type;
358 math = emit(MOV(dst, src_reg(math->dst)));
359 } else if (devinfo->gen < 6) {
360 math->base_mrf = 1;
361 math->mlen = src1.file == BAD_FILE ? 1 : 2;
362 }
363
364 return math;
365 }
366
367 void
368 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
369 {
370 if (devinfo->gen < 7) {
371 unreachable("ir_unop_pack_half_2x16 should be lowered");
372 }
373
374 assert(dst.type == BRW_REGISTER_TYPE_UD);
375 assert(src0.type == BRW_REGISTER_TYPE_F);
376
377 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
378 *
379 * Because this instruction does not have a 16-bit floating-point type,
380 * the destination data type must be Word (W).
381 *
382 * The destination must be DWord-aligned and specify a horizontal stride
383 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
384 * each destination channel and the upper word is not modified.
385 *
386 * The above restriction implies that the f32to16 instruction must use
387 * align1 mode, because only in align1 mode is it possible to specify
388 * horizontal stride. We choose here to defy the hardware docs and emit
389 * align16 instructions.
390 *
391 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
392 * instructions. I was partially successful in that the code passed all
393 * tests. However, the code was dubiously correct and fragile, and the
394 * tests were not harsh enough to probe that frailty. Not trusting the
395 * code, I chose instead to remain in align16 mode in defiance of the hw
396 * docs).
397 *
398 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
399 * simulator, emitting a f32to16 in align16 mode with UD as destination
400 * data type is safe. The behavior differs from that specified in the PRM
401 * in that the upper word of each destination channel is cleared to 0.
402 */
403
404 dst_reg tmp_dst(this, glsl_type::uvec2_type);
405 src_reg tmp_src(tmp_dst);
406
407 #if 0
408 /* Verify the undocumented behavior on which the following instructions
409 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
410 * then the result of the bit-or instruction below will be incorrect.
411 *
412 * You should inspect the disasm output in order to verify that the MOV is
413 * not optimized away.
414 */
415 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
416 #endif
417
418 /* Give tmp the form below, where "." means untouched.
419 *
420 * w z y x w z y x
421 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
422 *
423 * That the upper word of each write-channel be 0 is required for the
424 * following bit-shift and bit-or instructions to work. Note that this
425 * relies on the undocumented hardware behavior mentioned above.
426 */
427 tmp_dst.writemask = WRITEMASK_XY;
428 emit(F32TO16(tmp_dst, src0));
429
430 /* Give the write-channels of dst the form:
431 * 0xhhhh0000
432 */
433 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
434 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
435
436 /* Finally, give the write-channels of dst the form of packHalf2x16's
437 * output:
438 * 0xhhhhllll
439 */
440 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
441 emit(OR(dst, src_reg(dst), tmp_src));
442 }
443
444 void
445 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
446 {
447 if (devinfo->gen < 7) {
448 unreachable("ir_unop_unpack_half_2x16 should be lowered");
449 }
450
451 assert(dst.type == BRW_REGISTER_TYPE_F);
452 assert(src0.type == BRW_REGISTER_TYPE_UD);
453
454 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
455 *
456 * Because this instruction does not have a 16-bit floating-point type,
457 * the source data type must be Word (W). The destination type must be
458 * F (Float).
459 *
460 * To use W as the source data type, we must adjust horizontal strides,
461 * which is only possible in align1 mode. All my [chadv] attempts at
462 * emitting align1 instructions for unpackHalf2x16 failed to pass the
463 * Piglit tests, so I gave up.
464 *
465 * I've verified that, on gen7 hardware and the simulator, it is safe to
466 * emit f16to32 in align16 mode with UD as source data type.
467 */
468
469 dst_reg tmp_dst(this, glsl_type::uvec2_type);
470 src_reg tmp_src(tmp_dst);
471
472 tmp_dst.writemask = WRITEMASK_X;
473 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
474
475 tmp_dst.writemask = WRITEMASK_Y;
476 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
477
478 dst.writemask = WRITEMASK_XY;
479 emit(F16TO32(dst, tmp_src));
480 }
481
482 void
483 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
484 {
485 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
486 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
487 * is not suitable to generate the shift values, but we can use the packed
488 * vector float and a type-converting MOV.
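 *
 * A sketch of what the immediate below encodes (assuming the usual 8-bit
 * restricted-float VF layout of 1 sign, 3 exponent and 4 mantissa bits):
 * 0x00, 0x60, 0x70 and 0x78 decode to 0.0, 8.0, 16.0 and 24.0, which the
 * type-converting MOV into the UD register turns into the shift counts
 * <0, 8, 16, 24>.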
489 */
490 dst_reg shift(this, glsl_type::uvec4_type);
491 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
492
493 dst_reg shifted(this, glsl_type::uvec4_type);
494 src0.swizzle = BRW_SWIZZLE_XXXX;
495 emit(SHR(shifted, src0, src_reg(shift)));
496
497 shifted.type = BRW_REGISTER_TYPE_UB;
498 dst_reg f(this, glsl_type::vec4_type);
499 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
500
501 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
502 }
503
504 void
505 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
506 {
507 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
508 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
509 * is not suitable to generate the shift values, but we can use the packed
510 * vector float and a type-converting MOV.
511 */
512 dst_reg shift(this, glsl_type::uvec4_type);
513 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
514
515 dst_reg shifted(this, glsl_type::uvec4_type);
516 src0.swizzle = BRW_SWIZZLE_XXXX;
517 emit(SHR(shifted, src0, src_reg(shift)));
518
519 shifted.type = BRW_REGISTER_TYPE_B;
520 dst_reg f(this, glsl_type::vec4_type);
521 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
522
523 dst_reg scaled(this, glsl_type::vec4_type);
524 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
525
526 dst_reg max(this, glsl_type::vec4_type);
527 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
528 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
529 }
530
531 void
532 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
533 {
534 dst_reg saturated(this, glsl_type::vec4_type);
535 vec4_instruction *inst = emit(MOV(saturated, src0));
536 inst->saturate = true;
537
538 dst_reg scaled(this, glsl_type::vec4_type);
539 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
540
541 dst_reg rounded(this, glsl_type::vec4_type);
542 emit(RNDE(rounded, src_reg(scaled)));
543
544 dst_reg u(this, glsl_type::uvec4_type);
545 emit(MOV(u, src_reg(rounded)));
546
547 src_reg bytes(u);
548 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
549 }
550
551 void
552 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
553 {
554 dst_reg max(this, glsl_type::vec4_type);
555 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
556
557 dst_reg min(this, glsl_type::vec4_type);
558 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
559
560 dst_reg scaled(this, glsl_type::vec4_type);
561 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
562
563 dst_reg rounded(this, glsl_type::vec4_type);
564 emit(RNDE(rounded, src_reg(scaled)));
565
566 dst_reg i(this, glsl_type::ivec4_type);
567 emit(MOV(i, src_reg(rounded)));
568
569 src_reg bytes(i);
570 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
571 }
572
573 /*
574 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
575 * false) elements needed to pack a type.
576 */
577 static int
578 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
579 {
580 unsigned int i;
581 int size;
582
583 switch (type->base_type) {
584 case GLSL_TYPE_UINT:
585 case GLSL_TYPE_INT:
586 case GLSL_TYPE_FLOAT:
587 case GLSL_TYPE_BOOL:
588 case GLSL_TYPE_DOUBLE:
589 case GLSL_TYPE_UINT64:
590 case GLSL_TYPE_INT64:
591 if (type->is_matrix()) {
592 const glsl_type *col_type = type->column_type();
593 unsigned col_slots =
594 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
595 return type->matrix_columns * col_slots;
596 } else {
597 /* Regardless of size of vector, it gets a vec4. This is bad
598 * packing for things like floats, but otherwise arrays become a
599 * mess. Hopefully a later pass over the code can pack scalars
600 * down if appropriate.
601 */
602 return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
603 }
604 case GLSL_TYPE_ARRAY:
605 assert(type->length > 0);
606 return type_size_xvec4(type->fields.array, as_vec4) * type->length;
607 case GLSL_TYPE_STRUCT:
608 size = 0;
609 for (i = 0; i < type->length; i++) {
610 size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
611 }
612 return size;
613 case GLSL_TYPE_SUBROUTINE:
614 return 1;
615
616 case GLSL_TYPE_SAMPLER:
617 /* Samplers take up no register space, since they're baked in at
618 * link time.
619 */
620 return 0;
621 case GLSL_TYPE_ATOMIC_UINT:
622 return 0;
623 case GLSL_TYPE_IMAGE:
624 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
625 case GLSL_TYPE_VOID:
626 case GLSL_TYPE_ERROR:
627 case GLSL_TYPE_INTERFACE:
628 case GLSL_TYPE_FUNCTION:
629 unreachable("not reached");
630 }
631
632 return 0;
633 }
634
635 /**
636 * Returns the minimum number of vec4 elements needed to pack a type.
637 *
638 * For simple types, it will return 1 (a single vec4); for matrices, the
639 * number of columns; for array and struct, the sum of the vec4_size of
640 * each of its elements; and for sampler and atomic, zero.
641 *
642 * This method is useful to calculate how much register space is needed to
643 * store a particular type.
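 *
 * For example (an illustrative reading of the rules in type_size_xvec4()):
 * a float, bool or vec3 still takes one vec4 slot, a mat3 takes three (one
 * per column), a float[4] array takes four, and a dvec4 takes two because
 * it is dual-slot.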
644 */
645 extern "C" int
646 type_size_vec4(const struct glsl_type *type)
647 {
648 return type_size_xvec4(type, true);
649 }
650
651 /**
652 * Returns the minimum number of dvec4 elements needed to pack a type.
653 *
654 * For simple types, it will return 1 (a single dvec4); for matrices, the
655 * number of columns; for array and struct, the sum of the dvec4_size of
656 * each of its elements; and for sampler and atomic, zero.
657 *
658 * This method is useful to calculate how much register space is needed to
659 * store a particular type.
660 *
661 * Measuring double-precision vertex inputs as dvec4 is required because
662 * ARB_vertex_attrib_64bit states that they use the same number of locations
663 * as the single-precision version. That is, two consecutive dvec4s would be
664 * located in location "x" and location "x+1", not "x+2".
665 *
666 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
667 * remap_vs_attrs() will take into account both the location and whether the
668 * type fits in one or two vec4 slots.
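 *
 * As a concrete sketch of the difference: a dvec4[2] input measures as 2
 * here (one dvec4 slot per element, i.e. locations "x" and "x+1"), whereas
 * type_size_vec4() would report 4 for the same type.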
669 */
670 extern "C" int
671 type_size_dvec4(const struct glsl_type *type)
672 {
673 return type_size_xvec4(type, false);
674 }
675
676 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
677 {
678 init();
679
680 this->file = VGRF;
681 this->nr = v->alloc.allocate(type_size_vec4(type));
682
683 if (type->is_array() || type->is_record()) {
684 this->swizzle = BRW_SWIZZLE_NOOP;
685 } else {
686 this->swizzle = brw_swizzle_for_size(type->vector_elements);
687 }
688
689 this->type = brw_type_for_base_type(type);
690 }
691
692 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
693 {
694 assert(size > 0);
695
696 init();
697
698 this->file = VGRF;
699 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
700
701 this->swizzle = BRW_SWIZZLE_NOOP;
702
703 this->type = brw_type_for_base_type(type);
704 }
705
706 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
707 {
708 init();
709
710 this->file = VGRF;
711 this->nr = v->alloc.allocate(type_size_vec4(type));
712
713 if (type->is_array() || type->is_record()) {
714 this->writemask = WRITEMASK_XYZW;
715 } else {
716 this->writemask = (1 << type->vector_elements) - 1;
717 }
718
719 this->type = brw_type_for_base_type(type);
720 }
721
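/* SEL with a conditional modifier implements min/max: BRW_CONDITIONAL_L
 * yields min(src0, src1) and BRW_CONDITIONAL_GE yields max(src0, src1),
 * which is how the pack/unpack helpers above clamp snorm values to [-1, 1].
 */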
722 vec4_instruction *
723 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
724 src_reg src0, src_reg src1)
725 {
726 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
727 inst->conditional_mod = conditionalmod;
728 return inst;
729 }
730
731 vec4_instruction *
732 vec4_visitor::emit_lrp(const dst_reg &dst,
733 const src_reg &x, const src_reg &y, const src_reg &a)
734 {
735 if (devinfo->gen >= 6) {
736 /* Note that the instruction's argument order is reversed from GLSL
737 * and the IR.
738 */
739 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
740 fix_3src_operand(x)));
741 } else {
742 /* Earlier generations don't support three source operations, so we
743 * need to emit x*(1-a) + y*a.
744 */
745 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
746 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
747 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
748 y_times_a.writemask = dst.writemask;
749 one_minus_a.writemask = dst.writemask;
750 x_times_one_minus_a.writemask = dst.writemask;
751
752 emit(MUL(y_times_a, y, a));
753 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
754 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
755 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
756 }
757 }
758
759 /**
760 * Emits the instructions needed to perform a pull constant load. before_block
761 * and before_inst can be NULL in which case the instruction will be appended
762 * to the end of the instruction list.
763 */
764 void
765 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
766 src_reg surf_index,
767 src_reg offset_reg,
768 bblock_t *before_block,
769 vec4_instruction *before_inst)
770 {
771 assert((before_inst == NULL && before_block == NULL) ||
772 (before_inst && before_block));
773
774 vec4_instruction *pull;
775
776 if (devinfo->gen >= 9) {
777 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
778 src_reg header(this, glsl_type::uvec4_type, 2);
779
780 pull = new(mem_ctx)
781 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
782 dst_reg(header));
783
784 if (before_inst)
785 emit_before(before_block, before_inst, pull);
786 else
787 emit(pull);
788
789 dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
790 offset_reg.type);
791 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
792
793 if (before_inst)
794 emit_before(before_block, before_inst, pull);
795 else
796 emit(pull);
797
798 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
799 dst,
800 surf_index,
801 header);
802 pull->mlen = 2;
803 pull->header_size = 1;
804 } else if (devinfo->gen >= 7) {
805 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
806
807 grf_offset.type = offset_reg.type;
808
809 pull = MOV(grf_offset, offset_reg);
810
811 if (before_inst)
812 emit_before(before_block, before_inst, pull);
813 else
814 emit(pull);
815
816 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
817 dst,
818 surf_index,
819 src_reg(grf_offset));
820 pull->mlen = 1;
821 } else {
822 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
823 dst,
824 surf_index,
825 offset_reg);
826 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
827 pull->mlen = 1;
828 }
829
830 if (before_inst)
831 emit_before(before_block, before_inst, pull);
832 else
833 emit(pull);
834 }
835
836 src_reg
837 vec4_visitor::emit_uniformize(const src_reg &src)
838 {
839 const src_reg chan_index(this, glsl_type::uint_type);
840 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
841 src.type);
842
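   /* FIND_LIVE_CHANNEL computes the index of an enabled channel and
    * BROADCAST then replicates that channel of src across every channel of
    * dst, so the returned register reads the same in all channels.
    */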
843 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
844 ->force_writemask_all = true;
845 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
846 ->force_writemask_all = true;
847
848 return src_reg(dst);
849 }
850
851 src_reg
852 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
853 src_reg coordinate, src_reg surface)
854 {
855 vec4_instruction *inst =
856 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
857 dst_reg(this, glsl_type::uvec4_type));
858 inst->base_mrf = 2;
859 inst->src[1] = surface;
860 inst->src[2] = surface;
861
862 int param_base;
863
864 if (devinfo->gen >= 9) {
865 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
866 vec4_instruction *header_inst = new(mem_ctx)
867 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
868 dst_reg(MRF, inst->base_mrf));
869
870 emit(header_inst);
871
872 inst->mlen = 2;
873 inst->header_size = 1;
874 param_base = inst->base_mrf + 1;
875 } else {
876 inst->mlen = 1;
877 param_base = inst->base_mrf;
878 }
879
880 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
881 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
882 int zero_mask = 0xf & ~coord_mask;
883
884 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
885 coordinate));
886
887 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
888 brw_imm_d(0)));
889
890 emit(inst);
891 return src_reg(inst->dst);
892 }
893
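/* A sampler index that is not an immediate, or is 16 or greater, cannot be
 * encoded in the 4-bit sampler field of the message descriptor, so
 * emit_texture() routes it through the message-header path instead (only
 * relevant on Haswell and gen8+ here).
 */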
894 bool
895 vec4_visitor::is_high_sampler(src_reg sampler)
896 {
897 if (devinfo->gen < 8 && !devinfo->is_haswell)
898 return false;
899
900 return sampler.file != IMM || sampler.ud >= 16;
901 }
902
903 void
904 vec4_visitor::emit_texture(ir_texture_opcode op,
905 dst_reg dest,
906 const glsl_type *dest_type,
907 src_reg coordinate,
908 int coord_components,
909 src_reg shadow_comparator,
910 src_reg lod, src_reg lod2,
911 src_reg sample_index,
912 uint32_t constant_offset,
913 src_reg offset_value,
914 src_reg mcs,
915 uint32_t surface,
916 src_reg surface_reg,
917 src_reg sampler_reg)
918 {
919 enum opcode opcode;
920 switch (op) {
921 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
922 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
923 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
924 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
925 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
926 SHADER_OPCODE_TXF_CMS); break;
927 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
928 case ir_tg4: opcode = offset_value.file != BAD_FILE
929 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
930 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
931 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
932 case ir_txb:
933 unreachable("TXB is not valid for vertex shaders.");
934 case ir_lod:
935 unreachable("LOD is not valid for vertex shaders.");
936 case ir_samples_identical: {
937 /* There are some challenges implementing this for vec4, and it seems
938 * unlikely to be used anyway. For now, just always return false.
939 */
940 emit(MOV(dest, brw_imm_ud(0u)));
941 return;
942 }
943 default:
944 unreachable("Unrecognized tex op");
945 }
946
947 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
948
949 inst->offset = constant_offset;
950
951 /* The message header is necessary for:
952 * - Gen4 (always)
953 * - Gen9+ for selecting SIMD4x2
954 * - Texel offsets
955 * - Gather channel selection
956 * - Sampler indices too large to fit in a 4-bit value.
957 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
958 */
959 inst->header_size =
960 (devinfo->gen < 5 || devinfo->gen >= 9 ||
961 inst->offset != 0 || op == ir_tg4 ||
962 op == ir_texture_samples ||
963 is_high_sampler(sampler_reg)) ? 1 : 0;
964 inst->base_mrf = 2;
965 inst->mlen = inst->header_size;
966 inst->dst.writemask = WRITEMASK_XYZW;
967 inst->shadow_compare = shadow_comparator.file != BAD_FILE;
968
969 inst->src[1] = surface_reg;
970 inst->src[2] = sampler_reg;
971
972 /* MRF for the first parameter */
973 int param_base = inst->base_mrf + inst->header_size;
974
975 if (op == ir_txs || op == ir_query_levels) {
976 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
977 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
978 inst->mlen++;
979 } else if (op == ir_texture_samples) {
980 inst->dst.writemask = WRITEMASK_X;
981 } else {
982 /* Load the coordinate */
983 /* FINISHME: gl_clamp_mask and saturate */
984 int coord_mask = (1 << coord_components) - 1;
985 int zero_mask = 0xf & ~coord_mask;
986
987 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
988 coordinate));
989 inst->mlen++;
990
991 if (zero_mask != 0) {
992 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
993 brw_imm_d(0)));
994 }
995 /* Load the shadow comparator */
996 if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
997 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
998 WRITEMASK_X),
999 shadow_comparator));
1000 inst->mlen++;
1001 }
1002
1003 /* Load the LOD info */
1004 if (op == ir_tex || op == ir_txl) {
1005 int mrf, writemask;
1006 if (devinfo->gen >= 5) {
1007 mrf = param_base + 1;
1008 if (shadow_comparator.file != BAD_FILE) {
1009 writemask = WRITEMASK_Y;
1010 /* mlen already incremented */
1011 } else {
1012 writemask = WRITEMASK_X;
1013 inst->mlen++;
1014 }
1015 } else /* devinfo->gen == 4 */ {
1016 mrf = param_base;
1017 writemask = WRITEMASK_W;
1018 }
1019 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1020 } else if (op == ir_txf) {
1021 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1022 } else if (op == ir_txf_ms) {
1023 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1024 sample_index));
1025 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1026 /* MCS data is stored in the first two channels of `mcs`, but we
1027 * need to get it into the .y and .z channels of the second vec4
1028 * of params.
1029 */
1030 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1031 emit(MOV(dst_reg(MRF, param_base + 1,
1032 glsl_type::uint_type, WRITEMASK_YZ),
1033 mcs));
1034 } else if (devinfo->gen >= 7) {
1035 /* MCS data is in the first channel of `mcs`, but we need to get it into
1036 * the .y channel of the second vec4 of params, so replicate .x across
1037 * the whole vec4 and then mask off everything except .y
1038 */
1039 mcs.swizzle = BRW_SWIZZLE_XXXX;
1040 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1041 mcs));
1042 }
1043 inst->mlen++;
1044 } else if (op == ir_txd) {
1045 const brw_reg_type type = lod.type;
1046
1047 if (devinfo->gen >= 5) {
1048 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1049 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1050 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1051 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1052 inst->mlen++;
1053
1054 if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
1055 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1056 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1057 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1058 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1059 inst->mlen++;
1060
1061 if (shadow_comparator.file != BAD_FILE) {
1062 emit(MOV(dst_reg(MRF, param_base + 2,
1063 shadow_comparator.type, WRITEMASK_Z),
1064 shadow_comparator));
1065 }
1066 }
1067 } else /* devinfo->gen == 4 */ {
1068 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1069 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1070 inst->mlen += 2;
1071 }
1072 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1073 if (shadow_comparator.file != BAD_FILE) {
1074 emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
1075 shadow_comparator));
1076 }
1077
1078 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1079 offset_value));
1080 inst->mlen++;
1081 }
1082 }
1083
1084 emit(inst);
1085
1086 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1087 * spec requires layers.
1088 */
1089 if (op == ir_txs && devinfo->gen < 7) {
1090 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1091 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1092 src_reg(inst->dst), brw_imm_d(1));
1093 }
1094
1095 if (devinfo->gen == 6 && op == ir_tg4) {
1096 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1097 }
1098
1099 if (op == ir_query_levels) {
1100 /* # levels is in .w */
1101 src_reg swizzled(dest);
1102 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1103 SWIZZLE_W, SWIZZLE_W);
1104 emit(MOV(dest, swizzled));
1105 }
1106 }
1107
1108 /**
1109 * Apply workarounds for Gen6 gather with UINT/SINT
1110 */
1111 void
1112 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1113 {
1114 if (!wa)
1115 return;
1116
1117 int width = (wa & WA_8BIT) ? 8 : 16;
1118 dst_reg dst_f = dst;
1119 dst_f.type = BRW_REGISTER_TYPE_F;
1120
1121 /* Convert from UNORM to UINT */
1122 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1123 emit(MOV(dst, src_reg(dst_f)));
1124
1125 if (wa & WA_SIGN) {
1126 /* Reinterpret the UINT value as a signed INT value by
1127 * shifting the sign bit into place, then shifting back
1128 * preserving sign.
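 *
 * Worked example for an 8-bit signed format: a gathered value of 1.0
 * becomes 255 after the UNORM->UINT multiply above, the SHL by 24 gives
 * 0xff000000, and the ASR by 24 sign-extends that back to -1.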
1129 */
1130 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1131 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1132 }
1133 }
1134
1135 void
1136 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1137 {
1138 unreachable("not reached");
1139 }
1140
1141 void
1142 vec4_visitor::gs_end_primitive()
1143 {
1144 unreachable("not reached");
1145 }
1146
1147 void
1148 vec4_visitor::emit_ndc_computation()
1149 {
1150 if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1151 return;
1152
1153 /* Get the position */
1154 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1155
1156 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1157 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1158 output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1159 output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1160
1161 current_annotation = "NDC";
1162 dst_reg ndc_w = ndc;
1163 ndc_w.writemask = WRITEMASK_W;
1164 src_reg pos_w = pos;
1165 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1166 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1167
1168 dst_reg ndc_xyz = ndc;
1169 ndc_xyz.writemask = WRITEMASK_XYZ;
1170
1171 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1172 }
1173
1174 void
1175 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1176 {
1177 if (devinfo->gen < 6 &&
1178 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1179 output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1180 devinfo->has_negative_rhw_bug)) {
1181 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1182 dst_reg header1_w = header1;
1183 header1_w.writemask = WRITEMASK_W;
1184
1185 emit(MOV(header1, brw_imm_ud(0u)));
1186
1187 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1188 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1189
1190 current_annotation = "Point size";
1191 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1192 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1193 }
1194
1195 if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1196 current_annotation = "Clipping flags";
1197 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1198 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1199
1200 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1201 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1202 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1203
1204 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1205 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1206 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1207 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1208 }
1209
1210 /* i965 clipping workaround:
1211 * 1) Test for -ve rhw
1212 * 2) If set,
1213 * set ndc = (0,0,0,0)
1214 * set ucp[6] = 1
1215 *
1216 * Later, clipping will detect ucp[6] and ensure the primitive is
1217 * clipped against all fixed planes.
1218 */
1219 if (devinfo->has_negative_rhw_bug &&
1220 output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1221 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1222 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1223 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1224 vec4_instruction *inst;
1225 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1226 inst->predicate = BRW_PREDICATE_NORMAL;
1227 output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1228 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1229 inst->predicate = BRW_PREDICATE_NORMAL;
1230 }
1231
1232 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1233 } else if (devinfo->gen < 6) {
1234 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1235 } else {
1236 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1237 if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
1238 dst_reg reg_w = reg;
1239 reg_w.writemask = WRITEMASK_W;
1240 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1241 reg_as_src.type = reg_w.type;
1242 reg_as_src.swizzle = brw_swizzle_for_size(1);
1243 emit(MOV(reg_w, reg_as_src));
1244 }
1245 if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
1246 dst_reg reg_y = reg;
1247 reg_y.writemask = WRITEMASK_Y;
1248 reg_y.type = BRW_REGISTER_TYPE_D;
1249 output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1250 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1251 }
1252 if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
1253 dst_reg reg_z = reg;
1254 reg_z.writemask = WRITEMASK_Z;
1255 reg_z.type = BRW_REGISTER_TYPE_D;
1256 output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1257 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1258 }
1259 }
1260 }
1261
1262 vec4_instruction *
1263 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1264 {
1265 assert(varying < VARYING_SLOT_MAX);
1266
1267 unsigned num_comps = output_num_components[varying][component];
1268 if (num_comps == 0)
1269 return NULL;
1270
1271 assert(output_reg[varying][component].type == reg.type);
1272 current_annotation = output_reg_annotation[varying];
1273 if (output_reg[varying][component].file != BAD_FILE) {
1274 src_reg src = src_reg(output_reg[varying][component]);
1275 src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1276 reg.writemask =
1277 brw_writemask_for_component_packing(num_comps, component);
1278 return emit(MOV(reg, src));
1279 }
1280 return NULL;
1281 }
1282
1283 void
1284 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1285 {
1286 reg.type = BRW_REGISTER_TYPE_F;
1287 output_reg[varying][0].type = reg.type;
1288
1289 switch (varying) {
1290 case VARYING_SLOT_PSIZ:
1291 {
1292 /* PSIZ is always in slot 0, and is coupled with other flags. */
1293 current_annotation = "indices, point width, clip flags";
1294 emit_psiz_and_flags(reg);
1295 break;
1296 }
1297 case BRW_VARYING_SLOT_NDC:
1298 current_annotation = "NDC";
1299 if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1300 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1301 break;
1302 case VARYING_SLOT_POS:
1303 current_annotation = "gl_Position";
1304 if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1305 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1306 break;
1307 case VARYING_SLOT_EDGE: {
1308 /* This is present when doing unfilled polygons. We're supposed to copy
1309 * the edge flag from the user-provided vertex array
1310 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1311 * of that attribute (starts as 1.0f). This is then used in clipping to
1312 * determine which edges should be drawn as wireframe.
1313 */
1314 current_annotation = "edge flag";
1315 int edge_attr = _mesa_bitcount_64(nir->info.inputs_read &
1316 BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG));
1317 emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr,
1318 glsl_type::float_type, WRITEMASK_XYZW))));
1319 break;
1320 }
1321 case BRW_VARYING_SLOT_PAD:
1322 /* No need to write to this slot */
1323 break;
1324 default:
1325 for (int i = 0; i < 4; i++) {
1326 emit_generic_urb_slot(reg, varying, i);
1327 }
1328 break;
1329 }
1330 }
1331
1332 static int
1333 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
1334 {
1335 if (devinfo->gen >= 6) {
1336 /* URB data written (does not include the message header reg) must
1337 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1338 * section 5.4.3.2.2: URB_INTERLEAVED.
1339 *
1340 * URB entries are allocated on a multiple of 1024 bits, so an
1341 * extra 128 bits written here to make the end align to 256 is
1342 * no problem.
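 *
 * For example, an mlen of 4 (one header register plus three data
 * registers) gets bumped to 5 so that the data portion becomes four
 * registers, i.e. a multiple of the required two-register granularity.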
1343 */
1344 if ((mlen % 2) != 1)
1345 mlen++;
1346 }
1347
1348 return mlen;
1349 }
1350
1351
1352 /**
1353 * Generates the VUE payload plus the necessary URB write instructions to
1354 * output it.
1355 *
1356 * The VUE layout is documented in Volume 2a.
1357 */
1358 void
1359 vec4_visitor::emit_vertex()
1360 {
1361 /* MRF 0 is reserved for the debugger, so start with message header
1362 * in MRF 1.
1363 */
1364 int base_mrf = 1;
1365 int mrf = base_mrf;
1366 /* In the process of generating our URB write message contents, we
1367 * may need to unspill a register or load from an array. Those
1368 * reads would use MRFs 14-15.
1369 */
1370 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1371
1372 /* The following assertion verifies that max_usable_mrf causes an
1373 * even-numbered amount of URB write data, which will meet gen6's
1374 * requirements for length alignment.
1375 */
1376 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1377
1378 /* First mrf is the g0-based message header containing URB handles and
1379 * such.
1380 */
1381 emit_urb_write_header(mrf++);
1382
1383 if (devinfo->gen < 6) {
1384 emit_ndc_computation();
1385 }
1386
1387 /* We may need to split this up into several URB writes, so do them in a
1388 * loop.
1389 */
1390 int slot = 0;
1391 bool complete = false;
1392 do {
1393 /* URB offset is in URB row increments, and each of our MRFs is half of
1394 * one of those, since we're doing interleaved writes.
1395 */
1396 int offset = slot / 2;
1397
1398 mrf = base_mrf + 1;
1399 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1400 emit_urb_slot(dst_reg(MRF, mrf++),
1401 prog_data->vue_map.slot_to_varying[slot]);
1402
1403 /* If this was max_usable_mrf, we can't fit anything more into this
1404 * URB WRITE. Same thing if we reached the maximum length available.
1405 */
1406 if (mrf > max_usable_mrf ||
1407 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1408 slot++;
1409 break;
1410 }
1411 }
1412
1413 complete = slot >= prog_data->vue_map.num_slots;
1414 current_annotation = "URB write";
1415 vec4_instruction *inst = emit_urb_write_opcode(complete);
1416 inst->base_mrf = base_mrf;
1417 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1418 inst->offset += offset;
1419 } while(!complete);
1420 }
1421
1422
1423 src_reg
1424 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1425 src_reg *reladdr, int reg_offset)
1426 {
1427 /* Because we store the values to scratch interleaved like our
1428 * vertex data, we need to scale the vec4 index by 2.
1429 */
1430 int message_header_scale = 2;
1431
1432 /* Pre-gen6, the message header uses byte offsets instead of vec4
1433 * (16-byte) offset units.
1434 */
1435 if (devinfo->gen < 6)
1436 message_header_scale *= 16;
1437
1438 if (reladdr) {
1439 /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1440 * to multiply the reladdr by 2. Notice that the reg_offset part
1441 * is in units of 16 bytes and is used to select the low/high 16-byte
1442 * chunk of a full dvec4, so we don't want to multiply that part.
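 *
 * As a sketch of the resulting arithmetic: on gen6+ a vec4 access becomes
 * (reladdr + reg_offset) * 2, while a dvec4 access becomes
 * reladdr * 4 + reg_offset * 2; pre-gen6 the same values are additionally
 * scaled into byte units (a further factor of 16).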
1443 */
1444 src_reg index = src_reg(this, glsl_type::int_type);
1445 if (type_sz(inst->dst.type) < 8) {
1446 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1447 brw_imm_d(reg_offset)));
1448 emit_before(block, inst, MUL(dst_reg(index), index,
1449 brw_imm_d(message_header_scale)));
1450 } else {
1451 emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1452 brw_imm_d(message_header_scale * 2)));
1453 emit_before(block, inst, ADD(dst_reg(index), index,
1454 brw_imm_d(reg_offset * message_header_scale)));
1455 }
1456 return index;
1457 } else {
1458 return brw_imm_d(reg_offset * message_header_scale);
1459 }
1460 }
1461
1462 /**
1463 * Emits an instruction before @inst to load the value named by @orig_src
1464 * from scratch space at @base_offset to @temp.
1465 *
1466 * @base_offset is measured in 32-byte units (the size of a register).
1467 */
1468 void
1469 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1470 dst_reg temp, src_reg orig_src,
1471 int base_offset)
1472 {
1473 assert(orig_src.offset % REG_SIZE == 0);
1474 int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1475 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1476 reg_offset);
1477
1478 if (type_sz(orig_src.type) < 8) {
1479 emit_before(block, inst, SCRATCH_READ(temp, index));
1480 } else {
1481 dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1482 dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1483 emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1484 index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1485 vec4_instruction *last_read =
1486 SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1487 emit_before(block, inst, last_read);
1488 shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1489 }
1490 }
1491
1492 /**
1493 * Emits an instruction after @inst to store the value to be written
1494 * to @orig_dst to scratch space at @base_offset, from @temp.
1495 *
1496 * @base_offset is measured in 32-byte units (the size of a register).
1497 */
1498 void
1499 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1500 int base_offset)
1501 {
1502 assert(inst->dst.offset % REG_SIZE == 0);
1503 int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1504 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1505 reg_offset);
1506
1507 /* Create a temporary register to store *inst's result in.
1508 *
1509 * We have to be careful in MOVing from our temporary result register in
1510 * the scratch write. If we swizzle from channels of the temporary that
1511 * weren't initialized, it will confuse live interval analysis, which will
1512 * make spilling fail to make progress.
1513 */
1514 bool is_64bit = type_sz(inst->dst.type) == 8;
1515 const glsl_type *alloc_type =
1516 is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1517 const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1518 inst->dst.type),
1519 brw_swizzle_for_mask(inst->dst.writemask));
1520
1521 if (!is_64bit) {
1522 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1523 inst->dst.writemask));
1524 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1525 if (inst->opcode != BRW_OPCODE_SEL)
1526 write->predicate = inst->predicate;
1527 write->ir = inst->ir;
1528 write->annotation = inst->annotation;
1529 inst->insert_after(block, write);
1530 } else {
1531 dst_reg shuffled = dst_reg(this, alloc_type);
1532 vec4_instruction *last =
1533 shuffle_64bit_data(shuffled, temp, true, block, inst);
1534 src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1535
1536 uint8_t mask = 0;
1537 if (inst->dst.writemask & WRITEMASK_X)
1538 mask |= WRITEMASK_XY;
1539 if (inst->dst.writemask & WRITEMASK_Y)
1540 mask |= WRITEMASK_ZW;
1541 if (mask) {
1542 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1543
1544 vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1545 if (inst->opcode != BRW_OPCODE_SEL)
1546 write->predicate = inst->predicate;
1547 write->ir = inst->ir;
1548 write->annotation = inst->annotation;
1549 last->insert_after(block, write);
1550 }
1551
1552 mask = 0;
1553 if (inst->dst.writemask & WRITEMASK_Z)
1554 mask |= WRITEMASK_XY;
1555 if (inst->dst.writemask & WRITEMASK_W)
1556 mask |= WRITEMASK_ZW;
1557 if (mask) {
1558 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1559
1560 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1561 reg_offset + 1);
1562 vec4_instruction *write =
1563 SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1564 if (inst->opcode != BRW_OPCODE_SEL)
1565 write->predicate = inst->predicate;
1566 write->ir = inst->ir;
1567 write->annotation = inst->annotation;
1568 last->insert_after(block, write);
1569 }
1570 }
1571
1572 inst->dst.file = temp.file;
1573 inst->dst.nr = temp.nr;
1574 inst->dst.offset %= REG_SIZE;
1575 inst->dst.reladdr = NULL;
1576 }
1577
1578 /**
1579 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1580 * adds the scratch read(s) before \p inst. The function also checks for
1581 * recursive reladdr scratch accesses, issuing the corresponding scratch
1582 * loads and rewriting reladdr references accordingly.
1583 *
1584 * \return \p src if it did not require a scratch load, otherwise, the
1585 * register holding the result of the scratch load that the caller should
1586 * use to rewrite src.
1587 */
1588 src_reg
1589 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1590 vec4_instruction *inst, src_reg src)
1591 {
1592 /* Resolve recursive reladdr scratch access by calling ourselves
1593 * with src.reladdr
1594 */
1595 if (src.reladdr)
1596 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1597 *src.reladdr);
1598
1599 /* Now handle scratch access on src */
1600 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1601 dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1602 glsl_type::dvec4_type : glsl_type::vec4_type);
1603 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1604 src.nr = temp.nr;
1605 src.offset %= REG_SIZE;
1606 src.reladdr = NULL;
1607 }
1608
1609 return src;
1610 }
1611
1612 /**
1613 * We can't generally support array access in GRF space, because a
1614 * single instruction's destination can only span 2 contiguous
1615 * registers. So, we send all GRF arrays that get variable index
1616 * access to scratch space.
1617 */
1618 void
1619 vec4_visitor::move_grf_array_access_to_scratch()
1620 {
1621 int scratch_loc[this->alloc.count];
1622 memset(scratch_loc, -1, sizeof(scratch_loc));
1623
1624 /* First, calculate the set of virtual GRFs that need to be punted
1625 * to scratch due to having any array access on them, and where in
1626 * scratch.
1627 */
1628 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1629 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1630 if (scratch_loc[inst->dst.nr] == -1) {
1631 scratch_loc[inst->dst.nr] = last_scratch;
1632 last_scratch += this->alloc.sizes[inst->dst.nr];
1633 }
1634
1635 for (src_reg *iter = inst->dst.reladdr;
1636 iter->reladdr;
1637 iter = iter->reladdr) {
1638 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1639 scratch_loc[iter->nr] = last_scratch;
1640 last_scratch += this->alloc.sizes[iter->nr];
1641 }
1642 }
1643 }
1644
1645 for (int i = 0 ; i < 3; i++) {
1646 for (src_reg *iter = &inst->src[i];
1647 iter->reladdr;
1648 iter = iter->reladdr) {
1649 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1650 scratch_loc[iter->nr] = last_scratch;
1651 last_scratch += this->alloc.sizes[iter->nr];
1652 }
1653 }
1654 }
1655 }
1656
1657 /* Now, for anything that will be accessed through scratch, rewrite
1658 * it to load/store. Note that this is a _safe list walk, because
1659 * we may generate a new scratch_write instruction after the one
1660 * we're processing.
1661 */
1662 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1663 /* Set up the annotation tracking for new generated instructions. */
1664 base_ir = inst->ir;
1665 current_annotation = inst->annotation;
1666
1667 /* First handle scratch access on the dst. Notice we have to handle
1668 * the case where the dst's reladdr also points to scratch space.
1669 */
1670 if (inst->dst.reladdr)
1671 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1672 *inst->dst.reladdr);
1673
1674 /* Now that we have handled any (possibly recursive) reladdr scratch
1675 * accesses for dst, we can safely do the scratch write for dst itself.
1676 */
1677 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1678 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1679
1680 /* Now handle scratch access on any src. In this case, since inst->src[i]
1681 * already is a src_reg, we can just call emit_resolve_reladdr with
1682 * inst->src[i] and it will take care of handling scratch loads for
1683 * both src and src.reladdr (recursively).
1684 */
1685 for (int i = 0 ; i < 3; i++) {
1686 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1687 inst->src[i]);
1688 }
1689 }
1690 }
1691
1692 /**
1693 * Emits an instruction before @inst to load the value named by @orig_src
1694 * from the pull constant buffer (surface) at @base_offset to @temp.
1695 */
1696 void
1697 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1698 dst_reg temp, src_reg orig_src,
1699 int base_offset, src_reg indirect)
1700 {
1701 assert(orig_src.offset % 16 == 0);
1702 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1703
1704 /* For 64-bit loads we need to emit two 32-bit load messages, and we also
1705 * need to shuffle the 32-bit data result into proper 64-bit data. To do
1706 * that we emit the 32-bit loads into a temporary and we shuffle the result
1707 * into the original destination.
1708 */
1709 dst_reg orig_temp = temp;
1710 bool is_64bit = type_sz(orig_src.type) == 8;
1711 if (is_64bit) {
1712 assert(type_sz(temp.type) == 8);
1713 dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1714 temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1715 }
1716
1717 src_reg src = orig_src;
1718 for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
1719 int reg_offset = base_offset + src.offset / 16;
1720
1721 src_reg offset;
1722 if (indirect.file != BAD_FILE) {
1723 offset = src_reg(this, glsl_type::uint_type);
1724 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1725 brw_imm_ud(reg_offset * 16)));
1726 } else if (devinfo->gen >= 8) {
1727 /* Store the offset in a GRF so we can send-from-GRF. */
1728 offset = src_reg(this, glsl_type::uint_type);
1729 emit_before(block, inst, MOV(dst_reg(offset),
1730 brw_imm_ud(reg_offset * 16)));
1731 } else {
1732 offset = brw_imm_d(reg_offset * 16);
1733 }
1734
1735 emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1736 brw_imm_ud(index),
1737 offset,
1738 block, inst);
1739
1740 src = byte_offset(src, 16);
1741 }
1742
1743 brw_mark_surface_used(&prog_data->base, index);
1744
1745 if (is_64bit) {
1746 temp = retype(temp, BRW_REGISTER_TYPE_DF);
1747 shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
1748 }
1749 }
1750
1751 /**
1752 * Implements array access of uniforms by inserting a
1753 * PULL_CONSTANT_LOAD instruction.
1754 *
1755 * Unlike temporary GRF array access (where we don't support it due to
1756 * the difficulty of doing relative addressing on instruction
1757 * destinations), we could potentially do array access of uniforms
1758 * that were loaded in GRF space as push constants. In real-world
1759 * usage we've seen, though, the arrays being used are always larger
1760 * than we could load as push constants, so just always move all
1761 * uniform array access out to a pull constant buffer.
1762 */
1763 void
1764 vec4_visitor::move_uniform_array_access_to_pull_constants()
1765 {
1766 /* The Vulkan driver doesn't support pull constants other than UBOs, so
1767 * everything has to be pushed regardless.
1768 */
1769 if (!compiler->supports_pull_constants) {
1770 split_uniform_registers();
1771 return;
1772 }
1773
1774 /* Allocate the pull_params array */
1775 assert(stage_prog_data->nr_pull_params == 0);
1776 stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
1777 this->uniforms * 4);
1778
1779 int pull_constant_loc[this->uniforms];
1780 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1781
1782 /* First, walk through the instructions and determine which things need to
1783 * be pulled. We mark something as needing to be pulled by setting
1784 * pull_constant_loc to 0.
1785 */
1786 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1787 /* We only care about MOV_INDIRECT of a uniform */
1788 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1789 inst->src[0].file != UNIFORM)
1790 continue;
1791
1792 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1793
1794 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1795 pull_constant_loc[uniform_nr + j] = 0;
1796 }
1797
1798 /* Next, we walk the list of uniforms and assign real pull constant
1799 * locations and set their corresponding entries in pull_param.
1800 */
1801 for (int j = 0; j < this->uniforms; j++) {
1802 if (pull_constant_loc[j] < 0)
1803 continue;
1804
1805 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1806
1807 for (int i = 0; i < 4; i++) {
1808 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1809 = stage_prog_data->param[j * 4 + i];
1810 }
1811 }
1812
1813 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1814 * instructions to actual uniform pulls.
1815 */
1816 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1817 /* We only care about MOV_INDIRECT of a uniform */
1818 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1819 inst->src[0].file != UNIFORM)
1820 continue;
1821
1822 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1823
1824 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1825
1826 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1827 pull_constant_loc[uniform_nr], inst->src[1]);
1828 inst->remove(block);
1829 }
1830
1831 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1832 * no need to track them as larger-than-vec4 objects. This will be
1833 * relied on in cutting out unused uniform vectors from push
1834 * constants.
1835 */
1836 split_uniform_registers();
1837 }
1838
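/* Replace a negated UD source with a temporary produced by an explicit MOV,
 * so that callers such as the gen6 IF and CMP helpers above read a plain
 * register instead of relying on the negate modifier on a UD operand.
 */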
1839 void
1840 vec4_visitor::resolve_ud_negate(src_reg *reg)
1841 {
1842 if (reg->type != BRW_REGISTER_TYPE_UD ||
1843 !reg->negate)
1844 return;
1845
1846 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1847 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1848 *reg = temp;
1849 }
1850
1851 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1852 void *log_data,
1853 const struct brw_sampler_prog_key_data *key_tex,
1854 struct brw_vue_prog_data *prog_data,
1855 const nir_shader *shader,
1856 void *mem_ctx,
1857 bool no_spills,
1858 int shader_time_index)
1859 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1860 key_tex(key_tex),
1861 prog_data(prog_data),
1862 fail_msg(NULL),
1863 first_non_payload_grf(0),
1864 need_all_constants_in_pull_buffer(false),
1865 no_spills(no_spills),
1866 shader_time_index(shader_time_index),
1867 last_scratch(0)
1868 {
1869 this->failed = false;
1870
1871 this->base_ir = NULL;
1872 this->current_annotation = NULL;
1873 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1874
1875 memset(this->output_num_components, 0, sizeof(this->output_num_components));
1876
1877 this->virtual_grf_start = NULL;
1878 this->virtual_grf_end = NULL;
1879 this->live_intervals = NULL;
1880
1881 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1882
1883 this->uniforms = 0;
1884 }
1885
1886
1887 void
1888 vec4_visitor::fail(const char *format, ...)
1889 {
1890 va_list va;
1891 char *msg;
1892
1893 if (failed)
1894 return;
1895
1896 failed = true;
1897
1898 va_start(va, format);
1899 msg = ralloc_vasprintf(mem_ctx, format, va);
1900 va_end(va);
1901 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1902
1903 this->fail_msg = msg;
1904
1905 if (debug_enabled) {
1906 fprintf(stderr, "%s", msg);
1907 }
1908 }
1909
1910 } /* namespace brw */