intel/compiler: Allow MESA_SHADER_KERNEL
[mesa.git] / src / intel / compiler / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "util/u_math.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->eot = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->exec_size = 8;
59 this->group = 0;
60 this->size_written = (dst.file == BAD_FILE ?
61 0 : this->exec_size * type_sz(dst.type));
62 this->annotation = NULL;
63 }
64
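/**
 * Appends \p inst to the end of the instruction stream, tagging it with the
 * current base IR pointer and annotation so debug output can relate the
 * generated instruction back to the source it came from.
 */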
65 vec4_instruction *
66 vec4_visitor::emit(vec4_instruction *inst)
67 {
68 inst->ir = this->base_ir;
69 inst->annotation = this->current_annotation;
70
71 this->instructions.push_tail(inst);
72
73 return inst;
74 }
75
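/**
 * Inserts \p new_inst into the instruction stream immediately before
 * \p inst in \p block, copying \p inst's IR pointer and annotation onto it.
 */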
76 vec4_instruction *
77 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
78 vec4_instruction *new_inst)
79 {
80 new_inst->ir = inst->ir;
81 new_inst->annotation = inst->annotation;
82
83 inst->insert_before(block, new_inst);
84
85 return inst;
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
90 const src_reg &src1, const src_reg &src2)
91 {
92 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
93 }
94
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
98 const src_reg &src1)
99 {
100 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
101 }
102
103 vec4_instruction *
104 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
105 {
106 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
107 }
108
109 vec4_instruction *
110 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
111 {
112 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
113 }
114
115 vec4_instruction *
116 vec4_visitor::emit(enum opcode opcode)
117 {
118 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
119 }
120
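/*
 * Convenience builders for ALU instructions. Note that these only allocate
 * the vec4_instruction; callers are expected to pass the result to emit()
 * (or emit_before()) to actually add it to the instruction stream.
 */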
121 #define ALU1(op) \
122 vec4_instruction * \
123 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
124 { \
125 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
126 }
127
128 #define ALU2(op) \
129 vec4_instruction * \
130 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
131 const src_reg &src1) \
132 { \
133 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
134 src0, src1); \
135 }
136
137 #define ALU2_ACC(op) \
138 vec4_instruction * \
139 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
140 const src_reg &src1) \
141 { \
142 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
143 BRW_OPCODE_##op, dst, src0, src1); \
144 inst->writes_accumulator = true; \
145 return inst; \
146 }
147
148 #define ALU3(op) \
149 vec4_instruction * \
150 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
151 const src_reg &src1, const src_reg &src2) \
152 { \
153 assert(devinfo->gen >= 6); \
154 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
155 src0, src1, src2); \
156 }
157
158 ALU1(NOT)
159 ALU1(MOV)
160 ALU1(FRC)
161 ALU1(RNDD)
162 ALU1(RNDE)
163 ALU1(RNDZ)
164 ALU1(F32TO16)
165 ALU1(F16TO32)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2_ACC(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(DP3)
173 ALU2(DP4)
174 ALU2(DPH)
175 ALU2(SHL)
176 ALU2(SHR)
177 ALU2(ASR)
178 ALU3(LRP)
179 ALU1(BFREV)
180 ALU3(BFE)
181 ALU2(BFI1)
182 ALU3(BFI2)
183 ALU1(FBH)
184 ALU1(FBL)
185 ALU1(CBIT)
186 ALU3(MAD)
187 ALU2_ACC(ADDC)
188 ALU2_ACC(SUBB)
189 ALU2(MAC)
190 ALU1(DIM)
191
192 /** Gen4 predicated IF. */
193 vec4_instruction *
194 vec4_visitor::IF(enum brw_predicate predicate)
195 {
196 vec4_instruction *inst;
197
198 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
199 inst->predicate = predicate;
200
201 return inst;
202 }
203
204 /** Gen6 IF with embedded comparison. */
205 vec4_instruction *
206 vec4_visitor::IF(src_reg src0, src_reg src1,
207 enum brw_conditional_mod condition)
208 {
209 assert(devinfo->gen == 6);
210
211 vec4_instruction *inst;
212
213 resolve_ud_negate(&src0);
214 resolve_ud_negate(&src1);
215
216 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
217 src0, src1);
218 inst->conditional_mod = condition;
219
220 return inst;
221 }
222
223 /**
224 * CMP: Sets the low bit of the destination channels with the result
225 * of the comparison, while the upper bits are undefined, and updates
226 * the flag register with the packed 16 bits of the result.
227 */
228 vec4_instruction *
229 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
230 enum brw_conditional_mod condition)
231 {
232 vec4_instruction *inst;
233
234 /* Take the instruction:
235 *
236 * CMP null<d> src0<f> src1<f>
237 *
238 * Original gen4 does type conversion to the destination type before
239 * comparison, producing garbage results for floating point comparisons.
240 *
241 * The destination type doesn't matter on newer generations, so we set the
242 * type to match src0 so we can compact the instruction.
243 */
244 dst.type = src0.type;
245
246 resolve_ud_negate(&src0);
247 resolve_ud_negate(&src1);
248
249 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
250 inst->conditional_mod = condition;
251
252 return inst;
253 }
254
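/*
 * Builders for scratch-space (spill) read and write messages. These only
 * allocate the instruction and set up the spill MRFs and message length;
 * the caller emits them at the appropriate point.
 */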
255 vec4_instruction *
256 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
257 {
258 vec4_instruction *inst;
259
260 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
261 dst, index);
262 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
263 inst->mlen = 2;
264
265 return inst;
266 }
267
268 vec4_instruction *
269 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
270 const src_reg &index)
271 {
272 vec4_instruction *inst;
273
274 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
275 dst, src, index);
276 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
277 inst->mlen = 3;
278
279 return inst;
280 }
281
282 src_reg
283 vec4_visitor::fix_3src_operand(const src_reg &src)
284 {
285 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
286 * able to use vertical stride of zero to replicate the vec4 uniform, like
287 *
288 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
289 *
290 * But you can't, since vertical stride is always four in three-source
291 * instructions. Instead, insert a MOV instruction to do the replication so
292 * that the three-source instruction can consume it.
293 */
294
295 /* The MOV is only needed if the source is a uniform or immediate. */
296 if (src.file != UNIFORM && src.file != IMM)
297 return src;
298
299 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
300 return src;
301
302 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
303 expanded.type = src.type;
304 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
305 return src_reg(expanded);
306 }
307
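/**
 * Copies \p src through a temporary if it has abs/negate modifiers set, so
 * that callers which cannot handle source modifiers get a clean register.
 */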
308 src_reg
309 vec4_visitor::resolve_source_modifiers(const src_reg &src)
310 {
311 if (!src.abs && !src.negate)
312 return src;
313
314 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
315 resolved.type = src.type;
316 emit(MOV(resolved, src));
317
318 return src_reg(resolved);
319 }
320
321 src_reg
322 vec4_visitor::fix_math_operand(const src_reg &src)
323 {
324 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
325 return src;
326
327 /* The gen6 math instruction ignores the source modifiers --
328 * swizzle, abs, negate, and at least some parts of the register
329 * region description.
330 *
331 * Rather than trying to enumerate all these cases, *always* expand the
332 * operand to a temp GRF for gen6.
333 *
334 * For gen7, keep the operand as-is, except if immediate, which gen7 still
335 * can't use.
336 */
337
338 if (devinfo->gen == 7 && src.file != IMM)
339 return src;
340
341 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
342 expanded.type = src.type;
343 emit(MOV(expanded, src));
344 return src_reg(expanded);
345 }
346
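/**
 * Emits a math instruction, applying per-generation workarounds: operands
 * are legalized with fix_math_operand(), Gen6 math (which is align1 only)
 * writes a temporary that is then MOVed to \p dst when a partial writemask
 * is needed, and pre-Gen6 math gets its message registers (base_mrf/mlen)
 * set up here.
 */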
347 vec4_instruction *
348 vec4_visitor::emit_math(enum opcode opcode,
349 const dst_reg &dst,
350 const src_reg &src0, const src_reg &src1)
351 {
352 vec4_instruction *math =
353 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
354
355 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
356 /* MATH on Gen6 must be align1, so we can't do writemasks. */
357 math->dst = dst_reg(this, glsl_type::vec4_type);
358 math->dst.type = dst.type;
359 math = emit(MOV(dst, src_reg(math->dst)));
360 } else if (devinfo->gen < 6) {
361 math->base_mrf = 1;
362 math->mlen = src1.file == BAD_FILE ? 1 : 2;
363 }
364
365 return math;
366 }
367
368 void
369 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
370 {
371 if (devinfo->gen < 7) {
372 unreachable("ir_unop_pack_half_2x16 should be lowered");
373 }
374
375 assert(dst.type == BRW_REGISTER_TYPE_UD);
376 assert(src0.type == BRW_REGISTER_TYPE_F);
377
378 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
379 *
380 * Because this instruction does not have a 16-bit floating-point type,
381 * the destination data type must be Word (W).
382 *
383 * The destination must be DWord-aligned and specify a horizontal stride
384 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
385 * each destination channel and the upper word is not modified.
386 *
387 * The above restriction implies that the f32to16 instruction must use
388 * align1 mode, because only in align1 mode is it possible to specify
389 * horizontal stride. We choose here to defy the hardware docs and emit
390 * align16 instructions.
391 *
392 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
393 * instructions. I was partially successful in that the code passed all
394 * tests. However, the code was dubiously correct and fragile, and the
395 * tests were not harsh enough to probe that frailty. Not trusting the
396 * code, I chose instead to remain in align16 mode in defiance of the hw
397 * docs).
398 *
399 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
400 * simulator, emitting a f32to16 in align16 mode with UD as destination
401 * data type is safe. The behavior differs from that specified in the PRM
402 * in that the upper word of each destination channel is cleared to 0.
403 */
404
405 dst_reg tmp_dst(this, glsl_type::uvec2_type);
406 src_reg tmp_src(tmp_dst);
407
408 #if 0
409 /* Verify the undocumented behavior on which the following instructions
410 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
411 * then the result of the bit-or instruction below will be incorrect.
412 *
413 * You should inspect the disasm output in order to verify that the MOV is
414 * not optimized away.
415 */
416 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
417 #endif
418
419 /* Give tmp the form below, where "." means untouched.
420 *
421 * w z y x w z y x
422 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
423 *
424 * That the upper word of each write-channel be 0 is required for the
425 * following bit-shift and bit-or instructions to work. Note that this
426 * relies on the undocumented hardware behavior mentioned above.
427 */
428 tmp_dst.writemask = WRITEMASK_XY;
429 emit(F32TO16(tmp_dst, src0));
430
431 /* Give the write-channels of dst the form:
432 * 0xhhhh0000
433 */
434 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
435 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
436
437 /* Finally, give the write-channels of dst the form of packHalf2x16's
438 * output:
439 * 0xhhhhllll
440 */
441 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
442 emit(OR(dst, src_reg(dst), tmp_src));
443 }
444
445 void
446 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
447 {
448 if (devinfo->gen < 7) {
449 unreachable("ir_unop_unpack_half_2x16 should be lowered");
450 }
451
452 assert(dst.type == BRW_REGISTER_TYPE_F);
453 assert(src0.type == BRW_REGISTER_TYPE_UD);
454
455 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
456 *
457 * Because this instruction does not have a 16-bit floating-point type,
458 * the source data type must be Word (W). The destination type must be
459 * F (Float).
460 *
461 * To use W as the source data type, we must adjust horizontal strides,
462 * which is only possible in align1 mode. All my [chadv] attempts at
463 * emitting align1 instructions for unpackHalf2x16 failed to pass the
464 * Piglit tests, so I gave up.
465 *
466 * I've verified that, on gen7 hardware and the simulator, it is safe to
467 * emit f16to32 in align16 mode with UD as source data type.
468 */
469
470 dst_reg tmp_dst(this, glsl_type::uvec2_type);
471 src_reg tmp_src(tmp_dst);
472
473 tmp_dst.writemask = WRITEMASK_X;
474 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
475
476 tmp_dst.writemask = WRITEMASK_Y;
477 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
478
479 dst.writemask = WRITEMASK_XY;
480 emit(F16TO32(dst, tmp_src));
481 }
482
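/**
 * Implements unpackUnorm4x8(): shifts the four bytes of \p src0 into
 * separate channels, converts them to float and scales by 1/255.
 */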
483 void
484 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
485 {
486 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
487 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
488 * is not suitable to generate the shift values, but we can use the packed
489 * vector float and a type-converting MOV.
490 */
491 dst_reg shift(this, glsl_type::uvec4_type);
492 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
493
494 dst_reg shifted(this, glsl_type::uvec4_type);
495 src0.swizzle = BRW_SWIZZLE_XXXX;
496 emit(SHR(shifted, src0, src_reg(shift)));
497
498 shifted.type = BRW_REGISTER_TYPE_UB;
499 dst_reg f(this, glsl_type::vec4_type);
500 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
501
502 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
503 }
504
505 void
506 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
507 {
508 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
509 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
510 * is not suitable to generate the shift values, but we can use the packed
511 * vector float and a type-converting MOV.
512 */
513 dst_reg shift(this, glsl_type::uvec4_type);
514 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
515
516 dst_reg shifted(this, glsl_type::uvec4_type);
517 src0.swizzle = BRW_SWIZZLE_XXXX;
518 emit(SHR(shifted, src0, src_reg(shift)));
519
520 shifted.type = BRW_REGISTER_TYPE_B;
521 dst_reg f(this, glsl_type::vec4_type);
522 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
523
524 dst_reg scaled(this, glsl_type::vec4_type);
525 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
526
527 dst_reg max(this, glsl_type::vec4_type);
528 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
529 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
530 }
531
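/**
 * Implements packUnorm4x8(): saturates each component to [0, 1], scales by
 * 255, rounds to nearest even and packs the four bytes into \p dst.
 */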
532 void
533 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
534 {
535 dst_reg saturated(this, glsl_type::vec4_type);
536 vec4_instruction *inst = emit(MOV(saturated, src0));
537 inst->saturate = true;
538
539 dst_reg scaled(this, glsl_type::vec4_type);
540 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
541
542 dst_reg rounded(this, glsl_type::vec4_type);
543 emit(RNDE(rounded, src_reg(scaled)));
544
545 dst_reg u(this, glsl_type::uvec4_type);
546 emit(MOV(u, src_reg(rounded)));
547
548 src_reg bytes(u);
549 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
550 }
551
552 void
553 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
554 {
555 dst_reg max(this, glsl_type::vec4_type);
556 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
557
558 dst_reg min(this, glsl_type::vec4_type);
559 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
560
561 dst_reg scaled(this, glsl_type::vec4_type);
562 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
563
564 dst_reg rounded(this, glsl_type::vec4_type);
565 emit(RNDE(rounded, src_reg(scaled)));
566
567 dst_reg i(this, glsl_type::ivec4_type);
568 emit(MOV(i, src_reg(rounded)));
569
570 src_reg bytes(i);
571 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
572 }
573
574 /*
575 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
576 * false) elements needed to pack a type.
577 */
578 static int
579 type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless)
580 {
581 unsigned int i;
582 int size;
583
584 switch (type->base_type) {
585 case GLSL_TYPE_UINT:
586 case GLSL_TYPE_INT:
587 case GLSL_TYPE_FLOAT:
588 case GLSL_TYPE_FLOAT16:
589 case GLSL_TYPE_BOOL:
590 case GLSL_TYPE_DOUBLE:
591 case GLSL_TYPE_UINT16:
592 case GLSL_TYPE_INT16:
593 case GLSL_TYPE_UINT8:
594 case GLSL_TYPE_INT8:
595 case GLSL_TYPE_UINT64:
596 case GLSL_TYPE_INT64:
597 if (type->is_matrix()) {
598 const glsl_type *col_type = type->column_type();
599 unsigned col_slots =
600 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
601 return type->matrix_columns * col_slots;
602 } else {
603 /* Regardless of size of vector, it gets a vec4. This is bad
604 * packing for things like floats, but otherwise arrays become a
605 * mess. Hopefully a later pass over the code can pack scalars
606 * down if appropriate.
607 */
608 return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
609 }
610 case GLSL_TYPE_ARRAY:
611 assert(type->length > 0);
612 return type_size_xvec4(type->fields.array, as_vec4, bindless) *
613 type->length;
614 case GLSL_TYPE_STRUCT:
615 case GLSL_TYPE_INTERFACE:
616 size = 0;
617 for (i = 0; i < type->length; i++) {
618 size += type_size_xvec4(type->fields.structure[i].type, as_vec4,
619 bindless);
620 }
621 return size;
622 case GLSL_TYPE_SUBROUTINE:
623 return 1;
624
625 case GLSL_TYPE_SAMPLER:
626 /* Samplers take up no register space, since they're baked in at
627 * link time.
628 */
629 return bindless ? 1 : 0;
630 case GLSL_TYPE_ATOMIC_UINT:
631 return 0;
632 case GLSL_TYPE_IMAGE:
633 return bindless ? 1 : DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
634 case GLSL_TYPE_VOID:
635 case GLSL_TYPE_ERROR:
636 case GLSL_TYPE_FUNCTION:
637 unreachable("not reached");
638 }
639
640 return 0;
641 }
642
643 /**
644 * Returns the minimum number of vec4 elements needed to pack a type.
645 *
646 * For simple types, it will return 1 (a single vec4); for matrices, the
647 * number of columns; for array and struct, the sum of the vec4_size of
648 * each of its elements; and for sampler and atomic, zero.
649 *
650 * This method is useful to calculate how much register space is needed to
651 * store a particular type.
652 */
653 extern "C" int
654 type_size_vec4(const struct glsl_type *type, bool bindless)
655 {
656 return type_size_xvec4(type, true, bindless);
657 }
658
659 /**
660 * Returns the minimum number of dvec4 elements needed to pack a type.
661 *
662 * For simple types, it will return 1 (a single dvec4); for matrices, the
663 * number of columns; for array and struct, the sum of the dvec4_size of
664 * each of its elements; and for sampler and atomic, zero.
665 *
666 * This method is useful to calculate how much register space is needed to
667 * store a particular type.
668 *
 669  * Measuring double-precision vertex inputs as dvec4 is required because
 670  * ARB_vertex_attrib_64bit states that they use the same number of locations
 671  * as the single-precision version. That is, two consecutive dvec4s would be
 672  * located in locations "x" and "x+1", not "x+2".
 673  *
 674  * In order to map vec4/dvec4 vertex inputs into the proper ATTRs,
 675  * remap_vs_attrs() takes into account both the location and whether the
 676  * type fits in one or two vec4 slots.
677 */
678 extern "C" int
679 type_size_dvec4(const struct glsl_type *type, bool bindless)
680 {
681 return type_size_xvec4(type, false, bindless);
682 }
683
684 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
685 {
686 init();
687
688 this->file = VGRF;
689 this->nr = v->alloc.allocate(type_size_vec4(type, false));
690
691 if (type->is_array() || type->is_struct()) {
692 this->swizzle = BRW_SWIZZLE_NOOP;
693 } else {
694 this->swizzle = brw_swizzle_for_size(type->vector_elements);
695 }
696
697 this->type = brw_type_for_base_type(type);
698 }
699
700 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
701 {
702 assert(size > 0);
703
704 init();
705
706 this->file = VGRF;
707 this->nr = v->alloc.allocate(type_size_vec4(type, false) * size);
708
709 this->swizzle = BRW_SWIZZLE_NOOP;
710
711 this->type = brw_type_for_base_type(type);
712 }
713
714 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
715 {
716 init();
717
718 this->file = VGRF;
719 this->nr = v->alloc.allocate(type_size_vec4(type, false));
720
721 if (type->is_array() || type->is_struct()) {
722 this->writemask = WRITEMASK_XYZW;
723 } else {
724 this->writemask = (1 << type->vector_elements) - 1;
725 }
726
727 this->type = brw_type_for_base_type(type);
728 }
729
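/**
 * Emits a SEL with the given conditional mod; this is how MIN/MAX and
 * clamping are expressed in the vec4 backend.
 */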
730 vec4_instruction *
731 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
732 src_reg src0, src_reg src1)
733 {
734 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
735 inst->conditional_mod = conditionalmod;
736 return inst;
737 }
738
739 /**
740 * Emits the instructions needed to perform a pull constant load. before_block
 741  * and before_inst can be NULL, in which case the instruction will be appended
742 * to the end of the instruction list.
743 */
744 void
745 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
746 src_reg surf_index,
747 src_reg offset_reg,
748 bblock_t *before_block,
749 vec4_instruction *before_inst)
750 {
751 assert((before_inst == NULL && before_block == NULL) ||
752 (before_inst && before_block));
753
754 vec4_instruction *pull;
755
756 if (devinfo->gen >= 9) {
757 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
758 src_reg header(this, glsl_type::uvec4_type, 2);
759
760 pull = new(mem_ctx)
761 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
762 dst_reg(header));
763
764 if (before_inst)
765 emit_before(before_block, before_inst, pull);
766 else
767 emit(pull);
768
769 dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
770 offset_reg.type);
771 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
772
773 if (before_inst)
774 emit_before(before_block, before_inst, pull);
775 else
776 emit(pull);
777
778 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
779 dst,
780 surf_index,
781 header);
782 pull->mlen = 2;
783 pull->header_size = 1;
784 } else if (devinfo->gen >= 7) {
785 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
786
787 grf_offset.type = offset_reg.type;
788
789 pull = MOV(grf_offset, offset_reg);
790
791 if (before_inst)
792 emit_before(before_block, before_inst, pull);
793 else
794 emit(pull);
795
796 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
797 dst,
798 surf_index,
799 src_reg(grf_offset));
800 pull->mlen = 1;
801 } else {
802 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
803 dst,
804 surf_index,
805 offset_reg);
806 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
807 pull->mlen = 1;
808 }
809
810 if (before_inst)
811 emit_before(before_block, before_inst, pull);
812 else
813 emit(pull);
814 }
815
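/**
 * Returns a register containing the value of \p src taken from its first
 * live channel and broadcast to all channels, guaranteeing a uniform value.
 */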
816 src_reg
817 vec4_visitor::emit_uniformize(const src_reg &src)
818 {
819 const src_reg chan_index(this, glsl_type::uint_type);
820 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
821 src.type);
822
823 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
824 ->force_writemask_all = true;
825 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
826 ->force_writemask_all = true;
827
828 return src_reg(dst);
829 }
830
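/**
 * Emits a TXF_MCS message to fetch the multisample control surface (MCS)
 * data for \p coordinate from \p surface, returning the register that
 * receives the result.
 */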
831 src_reg
832 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
833 src_reg coordinate, src_reg surface)
834 {
835 vec4_instruction *inst =
836 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
837 dst_reg(this, glsl_type::uvec4_type));
838 inst->base_mrf = 2;
839 inst->src[1] = surface;
840 inst->src[2] = brw_imm_ud(0); /* sampler */
841
842 int param_base;
843
844 if (devinfo->gen >= 9) {
845 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
846 vec4_instruction *header_inst = new(mem_ctx)
847 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
848 dst_reg(MRF, inst->base_mrf));
849
850 emit(header_inst);
851
852 inst->mlen = 2;
853 inst->header_size = 1;
854 param_base = inst->base_mrf + 1;
855 } else {
856 inst->mlen = 1;
857 param_base = inst->base_mrf;
858 }
859
860 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
861 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
862 int zero_mask = 0xf & ~coord_mask;
863
864 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
865 coordinate));
866
867 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
868 brw_imm_d(0)));
869
870 emit(inst);
871 return src_reg(inst->dst);
872 }
873
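/**
 * Returns whether the sampler index may not fit in the 4-bit sampler field
 * of the message descriptor (it is either non-immediate or >= 16), in which
 * case the message needs a header. Always false before Haswell.
 */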
874 bool
875 vec4_visitor::is_high_sampler(src_reg sampler)
876 {
877 if (devinfo->gen < 8 && !devinfo->is_haswell)
878 return false;
879
880 return sampler.file != IMM || sampler.ud >= 16;
881 }
882
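/**
 * Emits the sampler message for a texturing operation: picks the opcode for
 * \p op, loads the coordinate, LOD/derivatives, shadow comparator and other
 * parameters into MRFs, and applies the post-send fixups (Gen6 gather
 * workaround, txs/query_levels adjustments).
 */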
883 void
884 vec4_visitor::emit_texture(ir_texture_opcode op,
885 dst_reg dest,
886 const glsl_type *dest_type,
887 src_reg coordinate,
888 int coord_components,
889 src_reg shadow_comparator,
890 src_reg lod, src_reg lod2,
891 src_reg sample_index,
892 uint32_t constant_offset,
893 src_reg offset_value,
894 src_reg mcs,
895 uint32_t surface,
896 src_reg surface_reg,
897 src_reg sampler_reg)
898 {
899 enum opcode opcode;
900 switch (op) {
901 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
902 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
903 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
904 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
905 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
906 SHADER_OPCODE_TXF_CMS); break;
907 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
908 case ir_tg4: opcode = offset_value.file != BAD_FILE
909 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
910 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
911 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
912 case ir_txb:
913 unreachable("TXB is not valid for vertex shaders.");
914 case ir_lod:
915 unreachable("LOD is not valid for vertex shaders.");
916 case ir_samples_identical: {
917 /* There are some challenges implementing this for vec4, and it seems
 918  * unlikely to be used anyway. For now, just always return false.
919 */
920 emit(MOV(dest, brw_imm_ud(0u)));
921 return;
922 }
923 default:
924 unreachable("Unrecognized tex op");
925 }
926
927 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
928
929 inst->offset = constant_offset;
930
931 /* The message header is necessary for:
932 * - Gen4 (always)
933 * - Gen9+ for selecting SIMD4x2
934 * - Texel offsets
935 * - Gather channel selection
936 * - Sampler indices too large to fit in a 4-bit value.
937 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
938 */
939 inst->header_size =
940 (devinfo->gen < 5 || devinfo->gen >= 9 ||
941 inst->offset != 0 || op == ir_tg4 ||
942 op == ir_texture_samples ||
943 is_high_sampler(sampler_reg)) ? 1 : 0;
944 inst->base_mrf = 2;
945 inst->mlen = inst->header_size;
946 inst->dst.writemask = WRITEMASK_XYZW;
947 inst->shadow_compare = shadow_comparator.file != BAD_FILE;
948
949 inst->src[1] = surface_reg;
950 inst->src[2] = sampler_reg;
951
952 /* MRF for the first parameter */
953 int param_base = inst->base_mrf + inst->header_size;
954
955 if (op == ir_txs || op == ir_query_levels) {
956 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
957 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
958 inst->mlen++;
959 } else if (op == ir_texture_samples) {
960 inst->dst.writemask = WRITEMASK_X;
961 } else {
962 /* Load the coordinate */
963 /* FINISHME: gl_clamp_mask and saturate */
964 int coord_mask = (1 << coord_components) - 1;
965 int zero_mask = 0xf & ~coord_mask;
966
967 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
968 coordinate));
969 inst->mlen++;
970
971 if (zero_mask != 0) {
972 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
973 brw_imm_d(0)));
974 }
975 /* Load the shadow comparator */
976 if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
977 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
978 WRITEMASK_X),
979 shadow_comparator));
980 inst->mlen++;
981 }
982
983 /* Load the LOD info */
984 if (op == ir_tex || op == ir_txl) {
985 int mrf, writemask;
986 if (devinfo->gen >= 5) {
987 mrf = param_base + 1;
988 if (shadow_comparator.file != BAD_FILE) {
989 writemask = WRITEMASK_Y;
990 /* mlen already incremented */
991 } else {
992 writemask = WRITEMASK_X;
993 inst->mlen++;
994 }
995 } else /* devinfo->gen == 4 */ {
996 mrf = param_base;
997 writemask = WRITEMASK_W;
998 }
999 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1000 } else if (op == ir_txf) {
1001 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1002 } else if (op == ir_txf_ms) {
1003 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1004 sample_index));
1005 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1006  * MCS data is stored in the first two channels of `mcs`, but we
1007 * need to get it into the .y and .z channels of the second vec4
1008 * of params.
1009 */
1010 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1011 emit(MOV(dst_reg(MRF, param_base + 1,
1012 glsl_type::uint_type, WRITEMASK_YZ),
1013 mcs));
1014 } else if (devinfo->gen >= 7) {
1015 /* MCS data is in the first channel of `mcs`, but we need to get it into
1016 * the .y channel of the second vec4 of params, so replicate .x across
1017 * the whole vec4 and then mask off everything except .y
1018 */
1019 mcs.swizzle = BRW_SWIZZLE_XXXX;
1020 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1021 mcs));
1022 }
1023 inst->mlen++;
1024 } else if (op == ir_txd) {
1025 const brw_reg_type type = lod.type;
1026
1027 if (devinfo->gen >= 5) {
1028 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1029 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1030 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1031 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1032 inst->mlen++;
1033
1034 if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
1035 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1036 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1037 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1038 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1039 inst->mlen++;
1040
1041 if (shadow_comparator.file != BAD_FILE) {
1042 emit(MOV(dst_reg(MRF, param_base + 2,
1043 shadow_comparator.type, WRITEMASK_Z),
1044 shadow_comparator));
1045 }
1046 }
1047 } else /* devinfo->gen == 4 */ {
1048 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1049 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1050 inst->mlen += 2;
1051 }
1052 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1053 if (shadow_comparator.file != BAD_FILE) {
1054 emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
1055 shadow_comparator));
1056 }
1057
1058 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1059 offset_value));
1060 inst->mlen++;
1061 }
1062 }
1063
1064 emit(inst);
1065
1066 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1067 * spec requires layers.
1068 */
1069 if (op == ir_txs && devinfo->gen < 7) {
1070 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1071 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1072 src_reg(inst->dst), brw_imm_d(1));
1073 }
1074
1075 if (devinfo->gen == 6 && op == ir_tg4) {
1076 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1077 }
1078
1079 if (op == ir_query_levels) {
1080 /* # levels is in .w */
1081 src_reg swizzled(dest);
1082 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1083 SWIZZLE_W, SWIZZLE_W);
1084 emit(MOV(dest, swizzled));
1085 }
1086 }
1087
1088 /**
1089 * Apply workarounds for Gen6 gather with UINT/SINT
1090 */
1091 void
1092 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1093 {
1094 if (!wa)
1095 return;
1096
1097 int width = (wa & WA_8BIT) ? 8 : 16;
1098 dst_reg dst_f = dst;
1099 dst_f.type = BRW_REGISTER_TYPE_F;
1100
1101 /* Convert from UNORM to UINT */
1102 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1103 emit(MOV(dst, src_reg(dst_f)));
1104
1105 if (wa & WA_SIGN) {
1106 /* Reinterpret the UINT value as a signed INT value by
1107 * shifting the sign bit into place, then shifting back
1108 * preserving sign.
1109 */
1110 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1111 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1112 }
1113 }
1114
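/*
 * Stubs for operations that only exist in geometry shaders; visitors for
 * stages that support them override these.
 */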
1115 void
1116 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1117 {
1118 unreachable("not reached");
1119 }
1120
1121 void
1122 vec4_visitor::gs_end_primitive()
1123 {
1124 unreachable("not reached");
1125 }
1126
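/**
 * Computes the NDC output (x/w, y/w, z/w, 1/w) from the written gl_Position
 * and records it in BRW_VARYING_SLOT_NDC; only used for pre-Gen6 vertex
 * emission.
 */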
1127 void
1128 vec4_visitor::emit_ndc_computation()
1129 {
1130 if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1131 return;
1132
1133 /* Get the position */
1134 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1135
1136 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1137 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1138 output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1139 output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1140
1141 current_annotation = "NDC";
1142 dst_reg ndc_w = ndc;
1143 ndc_w.writemask = WRITEMASK_W;
1144 src_reg pos_w = pos;
1145 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1146 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1147
1148 dst_reg ndc_xyz = ndc;
1149 ndc_xyz.writemask = WRITEMASK_XYZ;
1150
1151 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1152 }
1153
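/**
 * Fills the VUE header dword(s) in \p reg that carry point size, layer,
 * viewport index and clip flags; on pre-Gen6 this includes the packed
 * header1 flags and the negative-rhw clipping workaround.
 */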
1154 void
1155 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1156 {
1157 if (devinfo->gen < 6 &&
1158 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1159 output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1160 devinfo->has_negative_rhw_bug)) {
1161 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1162 dst_reg header1_w = header1;
1163 header1_w.writemask = WRITEMASK_W;
1164
1165 emit(MOV(header1, brw_imm_ud(0u)));
1166
1167 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1168 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1169
1170 current_annotation = "Point size";
1171 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1172 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1173 }
1174
1175 if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1176 current_annotation = "Clipping flags";
1177 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1178
1179 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1180 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1181 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1182 }
1183
1184 if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) {
1185 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1186 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1187 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1188 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1189 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1190 }
1191
1192 /* i965 clipping workaround:
1193 * 1) Test for -ve rhw
1194 * 2) If set,
1195 * set ndc = (0,0,0,0)
1196 * set ucp[6] = 1
1197 *
1198 * Later, clipping will detect ucp[6] and ensure the primitive is
1199 * clipped against all fixed planes.
1200 */
1201 if (devinfo->has_negative_rhw_bug &&
1202 output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1203 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1204 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1205 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1206 vec4_instruction *inst;
1207 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1208 inst->predicate = BRW_PREDICATE_NORMAL;
1209 output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1210 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1211 inst->predicate = BRW_PREDICATE_NORMAL;
1212 }
1213
1214 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1215 } else if (devinfo->gen < 6) {
1216 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1217 } else {
1218 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1219 if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
1220 dst_reg reg_w = reg;
1221 reg_w.writemask = WRITEMASK_W;
1222 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1223 reg_as_src.type = reg_w.type;
1224 reg_as_src.swizzle = brw_swizzle_for_size(1);
1225 emit(MOV(reg_w, reg_as_src));
1226 }
1227 if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
1228 dst_reg reg_y = reg;
1229 reg_y.writemask = WRITEMASK_Y;
1230 reg_y.type = BRW_REGISTER_TYPE_D;
1231 output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1232 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1233 }
1234 if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
1235 dst_reg reg_z = reg;
1236 reg_z.writemask = WRITEMASK_Z;
1237 reg_z.type = BRW_REGISTER_TYPE_D;
1238 output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1239 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1240 }
1241 }
1242 }
1243
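/**
 * Emits the MOV for one packed component range of a generic varying into
 * URB payload register \p reg, or returns NULL if that slot has nothing to
 * write.
 */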
1244 vec4_instruction *
1245 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1246 {
1247 assert(varying < VARYING_SLOT_MAX);
1248
1249 unsigned num_comps = output_num_components[varying][component];
1250 if (num_comps == 0)
1251 return NULL;
1252
1253 assert(output_reg[varying][component].type == reg.type);
1254 current_annotation = output_reg_annotation[varying];
1255 if (output_reg[varying][component].file != BAD_FILE) {
1256 src_reg src = src_reg(output_reg[varying][component]);
1257 src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1258 reg.writemask =
1259 brw_writemask_for_component_packing(num_comps, component);
1260 return emit(MOV(reg, src));
1261 }
1262 return NULL;
1263 }
1264
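/**
 * Emits the moves that fill URB payload register \p reg for the given
 * varying slot, handling the specially-packed header slots (PSIZ, NDC,
 * position, edge flag) as well as generic varyings.
 */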
1265 void
1266 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1267 {
1268 reg.type = BRW_REGISTER_TYPE_F;
1269 output_reg[varying][0].type = reg.type;
1270
1271 switch (varying) {
1272 case VARYING_SLOT_PSIZ:
1273 {
1274 /* PSIZ is always in slot 0, and is coupled with other flags. */
1275 current_annotation = "indices, point width, clip flags";
1276 emit_psiz_and_flags(reg);
1277 break;
1278 }
1279 case BRW_VARYING_SLOT_NDC:
1280 current_annotation = "NDC";
1281 if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1282 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1283 break;
1284 case VARYING_SLOT_POS:
1285 current_annotation = "gl_Position";
1286 if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1287 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1288 break;
1289 case VARYING_SLOT_EDGE: {
1290 /* This is present when doing unfilled polygons. We're supposed to copy
1291 * the edge flag from the user-provided vertex array
1292 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1293 * of that attribute (starts as 1.0f). This is then used in clipping to
1294 * determine which edges should be drawn as wireframe.
1295 */
1296 current_annotation = "edge flag";
1297 int edge_attr = util_bitcount64(nir->info.inputs_read &
1298 BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG));
1299 emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr,
1300 glsl_type::float_type, WRITEMASK_XYZW))));
1301 break;
1302 }
1303 case BRW_VARYING_SLOT_PAD:
1304 /* No need to write to this slot */
1305 break;
1306 default:
1307 for (int i = 0; i < 4; i++) {
1308 emit_generic_urb_slot(reg, varying, i);
1309 }
1310 break;
1311 }
1312 }
1313
1314 static unsigned
1315 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, unsigned mlen)
1316 {
1317 if (devinfo->gen >= 6) {
1318 /* URB data written (does not include the message header reg) must
1319 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1320 * section 5.4.3.2.2: URB_INTERLEAVED.
1321 *
1322 * URB entries are allocated on a multiple of 1024 bits, so an
1323 * extra 128 bits written here to make the end align to 256 is
1324 * no problem.
1325 */
1326 if ((mlen % 2) != 1)
1327 mlen++;
1328 }
1329
1330 return mlen;
1331 }
1332
1333
1334 /**
1335 * Generates the VUE payload plus the necessary URB write instructions to
1336 * output it.
1337 *
1338 * The VUE layout is documented in Volume 2a.
1339 */
1340 void
1341 vec4_visitor::emit_vertex()
1342 {
1343 /* MRF 0 is reserved for the debugger, so start with message header
1344 * in MRF 1.
1345 */
1346 int base_mrf = 1;
1347 int mrf = base_mrf;
1348 /* In the process of generating our URB write message contents, we
1349 * may need to unspill a register or load from an array. Those
1350 * reads would use MRFs 14-15.
1351 */
1352 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1353
1354 /* The following assertion verifies that max_usable_mrf causes an
1355 * even-numbered amount of URB write data, which will meet gen6's
1356 * requirements for length alignment.
1357 */
1358 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1359
1360 /* First mrf is the g0-based message header containing URB handles and
1361 * such.
1362 */
1363 emit_urb_write_header(mrf++);
1364
1365 if (devinfo->gen < 6) {
1366 emit_ndc_computation();
1367 }
1368
1369 /* We may need to split this up into several URB writes, so do them in a
1370 * loop.
1371 */
1372 int slot = 0;
1373 bool complete = false;
1374 do {
1375 /* URB offset is in URB row increments, and each of our MRFs is half of
1376 * one of those, since we're doing interleaved writes.
1377 */
1378 int offset = slot / 2;
1379
1380 mrf = base_mrf + 1;
1381 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1382 emit_urb_slot(dst_reg(MRF, mrf++),
1383 prog_data->vue_map.slot_to_varying[slot]);
1384
1385 /* If this was max_usable_mrf, we can't fit anything more into this
1386 * URB WRITE. Same thing if we reached the maximum length available.
1387 */
1388 if (mrf > max_usable_mrf ||
1389 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1390 slot++;
1391 break;
1392 }
1393 }
1394
1395 complete = slot >= prog_data->vue_map.num_slots;
1396 current_annotation = "URB write";
1397 vec4_instruction *inst = emit_urb_write_opcode(complete);
1398 inst->base_mrf = base_mrf;
1399 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1400 inst->offset += offset;
1401 } while(!complete);
1402 }
1403
1404
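/**
 * Computes the offset source for a scratch read or write of \p reg_offset
 * (plus the optional \p reladdr), scaled into the units the scratch
 * messages expect. Any required address math is emitted before \p inst.
 */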
1405 src_reg
1406 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1407 src_reg *reladdr, int reg_offset)
1408 {
1409 /* Because we store the values to scratch interleaved like our
1410 * vertex data, we need to scale the vec4 index by 2.
1411 */
1412 int message_header_scale = 2;
1413
1414 /* Pre-gen6, the message header uses byte offsets instead of vec4
1415 * (16-byte) offset units.
1416 */
1417 if (devinfo->gen < 6)
1418 message_header_scale *= 16;
1419
1420 if (reladdr) {
1421 /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1422 * to multiply the reladdr by 2. Notice that the reg_offset part
1423 * is in units of 16 bytes and is used to select the low/high 16-byte
1424 * chunk of a full dvec4, so we don't want to multiply that part.
1425 */
1426 src_reg index = src_reg(this, glsl_type::int_type);
1427 if (type_sz(inst->dst.type) < 8) {
1428 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1429 brw_imm_d(reg_offset)));
1430 emit_before(block, inst, MUL(dst_reg(index), index,
1431 brw_imm_d(message_header_scale)));
1432 } else {
1433 emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1434 brw_imm_d(message_header_scale * 2)));
1435 emit_before(block, inst, ADD(dst_reg(index), index,
1436 brw_imm_d(reg_offset * message_header_scale)));
1437 }
1438 return index;
1439 } else {
1440 return brw_imm_d(reg_offset * message_header_scale);
1441 }
1442 }
1443
1444 /**
1445 * Emits an instruction before @inst to load the value named by @orig_src
1446 * from scratch space at @base_offset to @temp.
1447 *
1448 * @base_offset is measured in 32-byte units (the size of a register).
1449 */
1450 void
1451 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1452 dst_reg temp, src_reg orig_src,
1453 int base_offset)
1454 {
1455 assert(orig_src.offset % REG_SIZE == 0);
1456 int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1457 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1458 reg_offset);
1459
1460 if (type_sz(orig_src.type) < 8) {
1461 emit_before(block, inst, SCRATCH_READ(temp, index));
1462 } else {
1463 dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1464 dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1465 emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1466 index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1467 vec4_instruction *last_read =
1468 SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1469 emit_before(block, inst, last_read);
1470 shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1471 }
1472 }
1473
1474 /**
1475 * Emits an instruction after @inst to store the value to be written
1476 * to @orig_dst to scratch space at @base_offset, from @temp.
1477 *
1478 * @base_offset is measured in 32-byte units (the size of a register).
1479 */
1480 void
1481 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1482 int base_offset)
1483 {
1484 assert(inst->dst.offset % REG_SIZE == 0);
1485 int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1486 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1487 reg_offset);
1488
1489 /* Create a temporary register to store *inst's result in.
1490 *
1491 * We have to be careful in MOVing from our temporary result register in
1492 * the scratch write. If we swizzle from channels of the temporary that
1493 * weren't initialized, it will confuse live interval analysis, which will
1494 * make spilling fail to make progress.
1495 */
1496 bool is_64bit = type_sz(inst->dst.type) == 8;
1497 const glsl_type *alloc_type =
1498 is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1499 const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1500 inst->dst.type),
1501 brw_swizzle_for_mask(inst->dst.writemask));
1502
1503 if (!is_64bit) {
1504 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1505 inst->dst.writemask));
1506 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1507 if (inst->opcode != BRW_OPCODE_SEL)
1508 write->predicate = inst->predicate;
1509 write->ir = inst->ir;
1510 write->annotation = inst->annotation;
1511 inst->insert_after(block, write);
1512 } else {
1513 dst_reg shuffled = dst_reg(this, alloc_type);
1514 vec4_instruction *last =
1515 shuffle_64bit_data(shuffled, temp, true, block, inst);
1516 src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1517
1518 uint8_t mask = 0;
1519 if (inst->dst.writemask & WRITEMASK_X)
1520 mask |= WRITEMASK_XY;
1521 if (inst->dst.writemask & WRITEMASK_Y)
1522 mask |= WRITEMASK_ZW;
1523 if (mask) {
1524 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1525
1526 vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1527 if (inst->opcode != BRW_OPCODE_SEL)
1528 write->predicate = inst->predicate;
1529 write->ir = inst->ir;
1530 write->annotation = inst->annotation;
1531 last->insert_after(block, write);
1532 }
1533
1534 mask = 0;
1535 if (inst->dst.writemask & WRITEMASK_Z)
1536 mask |= WRITEMASK_XY;
1537 if (inst->dst.writemask & WRITEMASK_W)
1538 mask |= WRITEMASK_ZW;
1539 if (mask) {
1540 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1541
1542 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1543 reg_offset + 1);
1544 vec4_instruction *write =
1545 SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1546 if (inst->opcode != BRW_OPCODE_SEL)
1547 write->predicate = inst->predicate;
1548 write->ir = inst->ir;
1549 write->annotation = inst->annotation;
1550 last->insert_after(block, write);
1551 }
1552 }
1553
1554 inst->dst.file = temp.file;
1555 inst->dst.nr = temp.nr;
1556 inst->dst.offset %= REG_SIZE;
1557 inst->dst.reladdr = NULL;
1558 }
1559
1560 /**
1561 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1562 * adds the scratch read(s) before \p inst. The function also checks for
1563 * recursive reladdr scratch accesses, issuing the corresponding scratch
1564 * loads and rewriting reladdr references accordingly.
1565 *
1566 * \return \p src if it did not require a scratch load, otherwise, the
1567 * register holding the result of the scratch load that the caller should
1568 * use to rewrite src.
1569 */
1570 src_reg
1571 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1572 vec4_instruction *inst, src_reg src)
1573 {
1574 /* Resolve recursive reladdr scratch access by calling ourselves
1575 * with src.reladdr
1576 */
1577 if (src.reladdr)
1578 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1579 *src.reladdr);
1580
1581 /* Now handle scratch access on src */
1582 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1583 dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1584 glsl_type::dvec4_type : glsl_type::vec4_type);
1585 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1586 src.nr = temp.nr;
1587 src.offset %= REG_SIZE;
1588 src.reladdr = NULL;
1589 }
1590
1591 return src;
1592 }
1593
1594 /**
1595 * We can't generally support array access in GRF space, because a
1596 * single instruction's destination can only span 2 contiguous
1597 * registers. So, we send all GRF arrays that get variable index
1598 * access to scratch space.
1599 */
1600 void
1601 vec4_visitor::move_grf_array_access_to_scratch()
1602 {
1603 int scratch_loc[this->alloc.count];
1604 memset(scratch_loc, -1, sizeof(scratch_loc));
1605
1606 /* First, calculate the set of virtual GRFs that need to be punted
1607 * to scratch due to having any array access on them, and where in
1608 * scratch.
1609 */
1610 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1611 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1612 if (scratch_loc[inst->dst.nr] == -1) {
1613 scratch_loc[inst->dst.nr] = last_scratch;
1614 last_scratch += this->alloc.sizes[inst->dst.nr];
1615 }
1616
1617 for (src_reg *iter = inst->dst.reladdr;
1618 iter->reladdr;
1619 iter = iter->reladdr) {
1620 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1621 scratch_loc[iter->nr] = last_scratch;
1622 last_scratch += this->alloc.sizes[iter->nr];
1623 }
1624 }
1625 }
1626
1627 for (int i = 0 ; i < 3; i++) {
1628 for (src_reg *iter = &inst->src[i];
1629 iter->reladdr;
1630 iter = iter->reladdr) {
1631 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1632 scratch_loc[iter->nr] = last_scratch;
1633 last_scratch += this->alloc.sizes[iter->nr];
1634 }
1635 }
1636 }
1637 }
1638
1639 /* Now, for anything that will be accessed through scratch, rewrite
1640 * it to load/store. Note that this is a _safe list walk, because
1641 * we may generate a new scratch_write instruction after the one
1642 * we're processing.
1643 */
1644 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1645 /* Set up the annotation tracking for new generated instructions. */
1646 base_ir = inst->ir;
1647 current_annotation = inst->annotation;
1648
1649 /* First handle scratch access on the dst. Notice we have to handle
1650 * the case where the dst's reladdr also points to scratch space.
1651 */
1652 if (inst->dst.reladdr)
1653 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1654 *inst->dst.reladdr);
1655
1656 /* Now that we have handled any (possibly recursive) reladdr scratch
1657 * accesses for dst we can safely do the scratch write for dst itself
1658 */
1659 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1660 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1661
1662 /* Now handle scratch access on any src. In this case, since inst->src[i]
1663 * already is a src_reg, we can just call emit_resolve_reladdr with
1664 * inst->src[i] and it will take care of handling scratch loads for
1665 * both src and src.reladdr (recursively).
1666 */
1667 for (int i = 0 ; i < 3; i++) {
1668 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1669 inst->src[i]);
1670 }
1671 }
1672 }
1673
1674 /**
1675 * Emits an instruction before @inst to load the value named by @orig_src
1676 * from the pull constant buffer (surface) at @base_offset to @temp.
1677 */
1678 void
1679 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1680 dst_reg temp, src_reg orig_src,
1681 int base_offset, src_reg indirect)
1682 {
1683 assert(orig_src.offset % 16 == 0);
1684 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1685
1686     /* For 64bit loads we need to emit two 32-bit load messages and we also
1687      * need to shuffle the 32-bit data result into proper 64-bit data. To do
1688 * that we emit the 32-bit loads into a temporary and we shuffle the result
1689 * into the original destination.
1690 */
1691 dst_reg orig_temp = temp;
1692 bool is_64bit = type_sz(orig_src.type) == 8;
1693 if (is_64bit) {
1694 assert(type_sz(temp.type) == 8);
1695 dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1696 temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1697 }
1698
1699 src_reg src = orig_src;
1700 for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
1701 int reg_offset = base_offset + src.offset / 16;
1702
1703 src_reg offset;
1704 if (indirect.file != BAD_FILE) {
1705 offset = src_reg(this, glsl_type::uint_type);
1706 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1707 brw_imm_ud(reg_offset * 16)));
1708 } else if (devinfo->gen >= 8) {
1709 /* Store the offset in a GRF so we can send-from-GRF. */
1710 offset = src_reg(this, glsl_type::uint_type);
1711 emit_before(block, inst, MOV(dst_reg(offset),
1712 brw_imm_ud(reg_offset * 16)));
1713 } else {
1714 offset = brw_imm_d(reg_offset * 16);
1715 }
1716
1717 emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1718 brw_imm_ud(index),
1719 offset,
1720 block, inst);
1721
1722 src = byte_offset(src, 16);
1723 }
1724
1725 if (is_64bit) {
1726 temp = retype(temp, BRW_REGISTER_TYPE_DF);
1727 shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
1728 }
1729 }
1730
1731 /**
1732 * Implements array access of uniforms by inserting a
1733 * PULL_CONSTANT_LOAD instruction.
1734 *
1735 * Unlike temporary GRF array access (where we don't support it due to
1736 * the difficulty of doing relative addressing on instruction
1737 * destinations), we could potentially do array access of uniforms
1738 * that were loaded in GRF space as push constants. In real-world
1739 * usage we've seen, though, the arrays being used are always larger
1740 * than we could load as push constants, so just always move all
1741 * uniform array access out to a pull constant buffer.
1742 */
1743 void
1744 vec4_visitor::move_uniform_array_access_to_pull_constants()
1745 {
1746    /* The Vulkan driver doesn't support pull constants other than UBOs, so
1747     * everything has to be pushed regardless.
1748 */
1749 if (!compiler->supports_pull_constants) {
1750 split_uniform_registers();
1751 return;
1752 }
1753
1754 /* Allocate the pull_params array */
1755 assert(stage_prog_data->nr_pull_params == 0);
1756 stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
1757 this->uniforms * 4);
1758
1759 int pull_constant_loc[this->uniforms];
1760 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1761
1762 /* First, walk through the instructions and determine which things need to
1763 * be pulled. We mark something as needing to be pulled by setting
1764 * pull_constant_loc to 0.
1765 */
1766 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1767 /* We only care about MOV_INDIRECT of a uniform */
1768 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1769 inst->src[0].file != UNIFORM)
1770 continue;
1771
1772 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1773
1774 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1775 pull_constant_loc[uniform_nr + j] = 0;
1776 }
1777
1778 /* Next, we walk the list of uniforms and assign real pull constant
1779 * locations and set their corresponding entries in pull_param.
1780 */
1781 for (int j = 0; j < this->uniforms; j++) {
1782 if (pull_constant_loc[j] < 0)
1783 continue;
1784
1785 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1786
1787 for (int i = 0; i < 4; i++) {
1788 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1789 = stage_prog_data->param[j * 4 + i];
1790 }
1791 }
1792
1793 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1794 * instructions to actual uniform pulls.
1795 */
1796 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1797 /* We only care about MOV_INDIRECT of a uniform */
1798 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1799 inst->src[0].file != UNIFORM)
1800 continue;
1801
1802 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1803
1804 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1805
1806 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1807 pull_constant_loc[uniform_nr], inst->src[1]);
1808 inst->remove(block);
1809 }
1810
1811 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1812 * no need to track them as larger-than-vec4 objects. This will be
1813 * relied on in cutting out unused uniform vectors from push
1814 * constants.
1815 */
1816 split_uniform_registers();
1817 }
1818
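/**
 * If \p reg is an unsigned-typed source with the negate modifier set,
 * materializes the negated value in a temporary with a MOV and rewrites
 * \p reg to point at it.
 */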
1819 void
1820 vec4_visitor::resolve_ud_negate(src_reg *reg)
1821 {
1822 if (reg->type != BRW_REGISTER_TYPE_UD ||
1823 !reg->negate)
1824 return;
1825
1826 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1827 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1828 *reg = temp;
1829 }
1830
1831 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1832 void *log_data,
1833 const struct brw_sampler_prog_key_data *key_tex,
1834 struct brw_vue_prog_data *prog_data,
1835 const nir_shader *shader,
1836 void *mem_ctx,
1837 bool no_spills,
1838 int shader_time_index)
1839 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1840 key_tex(key_tex),
1841 prog_data(prog_data),
1842 fail_msg(NULL),
1843 first_non_payload_grf(0),
1844 live_analysis(this), performance_analysis(this),
1845 need_all_constants_in_pull_buffer(false),
1846 no_spills(no_spills),
1847 shader_time_index(shader_time_index),
1848 last_scratch(0)
1849 {
1850 this->failed = false;
1851
1852 this->base_ir = NULL;
1853 this->current_annotation = NULL;
1854 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1855
1856 memset(this->output_num_components, 0, sizeof(this->output_num_components));
1857
1858 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1859
1860 this->uniforms = 0;
1861
1862 this->nir_locals = NULL;
1863 this->nir_ssa_values = NULL;
1864 }
1865
1866
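/**
 * Records a compilation failure: marks the visitor as failed and stores a
 * formatted, stage-prefixed message in fail_msg (also printed to stderr
 * when debugging is enabled). Only the first failure is recorded.
 */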
1867 void
1868 vec4_visitor::fail(const char *format, ...)
1869 {
1870 va_list va;
1871 char *msg;
1872
1873 if (failed)
1874 return;
1875
1876 failed = true;
1877
1878 va_start(va, format);
1879 msg = ralloc_vasprintf(mem_ctx, format, va);
1880 va_end(va);
1881 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1882
1883 this->fail_msg = msg;
1884
1885 if (debug_enabled) {
1886 fprintf(stderr, "%s", msg);
1887 }
1888 }
1889
1890 } /* namespace brw */