nir/i965/freedreno/vc4: add a bindless bool to type size functions
[mesa.git] / src / intel / compiler / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "util/u_math.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->eot = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->exec_size = 8;
59 this->group = 0;
60 this->size_written = (dst.file == BAD_FILE ?
61 0 : this->exec_size * type_sz(dst.type));
62 this->annotation = NULL;
63 }
64
65 vec4_instruction *
66 vec4_visitor::emit(vec4_instruction *inst)
67 {
68 inst->ir = this->base_ir;
69 inst->annotation = this->current_annotation;
70
71 this->instructions.push_tail(inst);
72
73 return inst;
74 }
75
76 vec4_instruction *
77 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
78 vec4_instruction *new_inst)
79 {
80 new_inst->ir = inst->ir;
81 new_inst->annotation = inst->annotation;
82
83 inst->insert_before(block, new_inst);
84
85 return inst;
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
90 const src_reg &src1, const src_reg &src2)
91 {
92 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
93 }
94
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
98 const src_reg &src1)
99 {
100 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
101 }
102
103 vec4_instruction *
104 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
105 {
106 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
107 }
108
109 vec4_instruction *
110 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
111 {
112 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
113 }
114
115 vec4_instruction *
116 vec4_visitor::emit(enum opcode opcode)
117 {
118 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
119 }
120
121 #define ALU1(op) \
122 vec4_instruction * \
123 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
124 { \
125 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
126 }
127
128 #define ALU2(op) \
129 vec4_instruction * \
130 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
131 const src_reg &src1) \
132 { \
133 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
134 src0, src1); \
135 }
136
137 #define ALU2_ACC(op) \
138 vec4_instruction * \
139 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
140 const src_reg &src1) \
141 { \
142 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
143 BRW_OPCODE_##op, dst, src0, src1); \
144 inst->writes_accumulator = true; \
145 return inst; \
146 }
147
148 #define ALU3(op) \
149 vec4_instruction * \
150 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
151 const src_reg &src1, const src_reg &src2) \
152 { \
153 assert(devinfo->gen >= 6); \
154 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
155 src0, src1, src2); \
156 }
157
158 ALU1(NOT)
159 ALU1(MOV)
160 ALU1(FRC)
161 ALU1(RNDD)
162 ALU1(RNDE)
163 ALU1(RNDZ)
164 ALU1(F32TO16)
165 ALU1(F16TO32)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2_ACC(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(DP3)
173 ALU2(DP4)
174 ALU2(DPH)
175 ALU2(SHL)
176 ALU2(SHR)
177 ALU2(ASR)
178 ALU3(LRP)
179 ALU1(BFREV)
180 ALU3(BFE)
181 ALU2(BFI1)
182 ALU3(BFI2)
183 ALU1(FBH)
184 ALU1(FBL)
185 ALU1(CBIT)
186 ALU3(MAD)
187 ALU2_ACC(ADDC)
188 ALU2_ACC(SUBB)
189 ALU2(MAC)
190 ALU1(DIM)
191
192 /** Gen4 predicated IF. */
193 vec4_instruction *
194 vec4_visitor::IF(enum brw_predicate predicate)
195 {
196 vec4_instruction *inst;
197
198 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
199 inst->predicate = predicate;
200
201 return inst;
202 }
203
204 /** Gen6 IF with embedded comparison. */
205 vec4_instruction *
206 vec4_visitor::IF(src_reg src0, src_reg src1,
207 enum brw_conditional_mod condition)
208 {
209 assert(devinfo->gen == 6);
210
211 vec4_instruction *inst;
212
213 resolve_ud_negate(&src0);
214 resolve_ud_negate(&src1);
215
216 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
217 src0, src1);
218 inst->conditional_mod = condition;
219
220 return inst;
221 }
222
223 /**
224 * CMP: Sets the low bit of the destination channels with the result
225 * of the comparison, while the upper bits are undefined, and updates
226 * the flag register with the packed 16 bits of the result.
227 */
228 vec4_instruction *
229 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
230 enum brw_conditional_mod condition)
231 {
232 vec4_instruction *inst;
233
234 /* Take the instruction:
235 *
236 * CMP null<d> src0<f> src1<f>
237 *
238 * Original gen4 does type conversion to the destination type before
239 * comparison, producing garbage results for floating point comparisons.
240 *
241 * The destination type doesn't matter on newer generations, so we set the
242 * type to match src0 so we can compact the instruction.
243 */
244 dst.type = src0.type;
245
246 resolve_ud_negate(&src0);
247 resolve_ud_negate(&src1);
248
249 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
250 inst->conditional_mod = condition;
251
252 return inst;
253 }
254
255 vec4_instruction *
256 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
257 {
258 vec4_instruction *inst;
259
260 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
261 dst, index);
262 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
263 inst->mlen = 2;
264
265 return inst;
266 }
267
268 vec4_instruction *
269 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
270 const src_reg &index)
271 {
272 vec4_instruction *inst;
273
274 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
275 dst, src, index);
276 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
277 inst->mlen = 3;
278
279 return inst;
280 }
281
282 src_reg
283 vec4_visitor::fix_3src_operand(const src_reg &src)
284 {
285 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
286 * able to use vertical stride of zero to replicate the vec4 uniform, like
287 *
288 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
289 *
290 * But you can't, since vertical stride is always four in three-source
291 * instructions. Instead, insert a MOV instruction to do the replication so
292 * that the three-source instruction can consume it.
293 */
294
295 /* The MOV is only needed if the source is a uniform or immediate. */
296 if (src.file != UNIFORM && src.file != IMM)
297 return src;
298
299 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
300 return src;
301
302 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
303 expanded.type = src.type;
304 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
305 return src_reg(expanded);
306 }
307
308 src_reg
309 vec4_visitor::resolve_source_modifiers(const src_reg &src)
310 {
311 if (!src.abs && !src.negate)
312 return src;
313
314 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
315 resolved.type = src.type;
316 emit(MOV(resolved, src));
317
318 return src_reg(resolved);
319 }
320
321 src_reg
322 vec4_visitor::fix_math_operand(const src_reg &src)
323 {
324 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
325 return src;
326
327 /* The gen6 math instruction ignores the source modifiers --
328 * swizzle, abs, negate, and at least some parts of the register
329 * region description.
330 *
331 * Rather than trying to enumerate all these cases, *always* expand the
332 * operand to a temp GRF for gen6.
333 *
334 * For gen7, keep the operand as-is, except for immediates, which gen7
335 * still can't use.
336 */
337
338 if (devinfo->gen == 7 && src.file != IMM)
339 return src;
340
341 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
342 expanded.type = src.type;
343 emit(MOV(expanded, src));
344 return src_reg(expanded);
345 }
346
347 vec4_instruction *
348 vec4_visitor::emit_math(enum opcode opcode,
349 const dst_reg &dst,
350 const src_reg &src0, const src_reg &src1)
351 {
352 vec4_instruction *math =
353 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
354
355 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
356 /* MATH on Gen6 must be align1, so we can't do writemasks. */
357 math->dst = dst_reg(this, glsl_type::vec4_type);
358 math->dst.type = dst.type;
359 math = emit(MOV(dst, src_reg(math->dst)));
360 } else if (devinfo->gen < 6) {
361 math->base_mrf = 1;
362 math->mlen = src1.file == BAD_FILE ? 1 : 2;
363 }
364
365 return math;
366 }
367
368 void
369 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
370 {
371 if (devinfo->gen < 7) {
372 unreachable("ir_unop_pack_half_2x16 should be lowered");
373 }
374
375 assert(dst.type == BRW_REGISTER_TYPE_UD);
376 assert(src0.type == BRW_REGISTER_TYPE_F);
377
378 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
379 *
380 * Because this instruction does not have a 16-bit floating-point type,
381 * the destination data type must be Word (W).
382 *
383 * The destination must be DWord-aligned and specify a horizontal stride
384 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
385 * each destination channel and the upper word is not modified.
386 *
387 * The above restriction implies that the f32to16 instruction must use
388 * align1 mode, because only in align1 mode is it possible to specify
389 * horizontal stride. We choose here to defy the hardware docs and emit
390 * align16 instructions.
391 *
392 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
393 * instructions. I was partially successful in that the code passed all
394 * tests. However, the code was dubiously correct and fragile, and the
395 * tests were not harsh enough to probe that frailty. Not trusting the
396 * code, I chose instead to remain in align16 mode in defiance of the hw
397 * docs).
398 *
399 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
400 * simulator, emitting a f32to16 in align16 mode with UD as destination
401 * data type is safe. The behavior differs from that specified in the PRM
402 * in that the upper word of each destination channel is cleared to 0.
403 */
404
405 dst_reg tmp_dst(this, glsl_type::uvec2_type);
406 src_reg tmp_src(tmp_dst);
407
408 #if 0
409 /* Verify the undocumented behavior on which the following instructions
410 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
411 * then the result of the bit-or instruction below will be incorrect.
412 *
413 * You should inspect the disasm output in order to verify that the MOV is
414 * not optimized away.
415 */
416 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
417 #endif
418
419 /* Give tmp the form below, where "." means untouched.
420 *
421 * w z y x w z y x
422 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
423 *
424 * That the upper word of each write-channel be 0 is required for the
425 * following bit-shift and bit-or instructions to work. Note that this
426 * relies on the undocumented hardware behavior mentioned above.
427 */
428 tmp_dst.writemask = WRITEMASK_XY;
429 emit(F32TO16(tmp_dst, src0));
430
431 /* Give the write-channels of dst the form:
432 * 0xhhhh0000
433 */
434 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
435 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
436
437 /* Finally, give the write-channels of dst the form of packHalf2x16's
438 * output:
439 * 0xhhhhllll
440 */
441 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
442 emit(OR(dst, src_reg(dst), tmp_src));
443 }
444
445 void
446 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
447 {
448 if (devinfo->gen < 7) {
449 unreachable("ir_unop_unpack_half_2x16 should be lowered");
450 }
451
452 assert(dst.type == BRW_REGISTER_TYPE_F);
453 assert(src0.type == BRW_REGISTER_TYPE_UD);
454
455 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
456 *
457 * Because this instruction does not have a 16-bit floating-point type,
458 * the source data type must be Word (W). The destination type must be
459 * F (Float).
460 *
461 * To use W as the source data type, we must adjust horizontal strides,
462 * which is only possible in align1 mode. All my [chadv] attempts at
463 * emitting align1 instructions for unpackHalf2x16 failed to pass the
464 * Piglit tests, so I gave up.
465 *
466 * I've verified that, on gen7 hardware and the simulator, it is safe to
467 * emit f16to32 in align16 mode with UD as source data type.
468 */
469
470 dst_reg tmp_dst(this, glsl_type::uvec2_type);
471 src_reg tmp_src(tmp_dst);
472
473 tmp_dst.writemask = WRITEMASK_X;
474 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
475
476 tmp_dst.writemask = WRITEMASK_Y;
477 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
478
479 dst.writemask = WRITEMASK_XY;
480 emit(F16TO32(dst, tmp_src));
481 }
482
483 void
484 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
485 {
486 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
487 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
488 * is not suitable to generate the shift values, but we can use the packed
489 * vector float and a type-converting MOV.
490 */
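   /* (For reference: in the 8-bit vector-float immediate format used by
    * brw_imm_vf4, the encodings 0x00, 0x60, 0x70 and 0x78 decode to 0.0, 8.0,
    * 16.0 and 24.0, so the type-converting MOV below yields exactly the
    * <0, 8, 16, 24> shift counts described above.)
    */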
491 dst_reg shift(this, glsl_type::uvec4_type);
492 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
493
494 dst_reg shifted(this, glsl_type::uvec4_type);
495 src0.swizzle = BRW_SWIZZLE_XXXX;
496 emit(SHR(shifted, src0, src_reg(shift)));
497
498 shifted.type = BRW_REGISTER_TYPE_UB;
499 dst_reg f(this, glsl_type::vec4_type);
500 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
501
502 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
503 }
504
505 void
506 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
507 {
508 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
509 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
510 * is not suitable to generate the shift values, but we can use the packed
511 * vector float and a type-converting MOV.
512 */
513 dst_reg shift(this, glsl_type::uvec4_type);
514 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
515
516 dst_reg shifted(this, glsl_type::uvec4_type);
517 src0.swizzle = BRW_SWIZZLE_XXXX;
518 emit(SHR(shifted, src0, src_reg(shift)));
519
520 shifted.type = BRW_REGISTER_TYPE_B;
521 dst_reg f(this, glsl_type::vec4_type);
522 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
523
524 dst_reg scaled(this, glsl_type::vec4_type);
525 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
526
527 dst_reg max(this, glsl_type::vec4_type);
528 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
529 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
530 }
531
532 void
533 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
534 {
535 dst_reg saturated(this, glsl_type::vec4_type);
536 vec4_instruction *inst = emit(MOV(saturated, src0));
537 inst->saturate = true;
538
539 dst_reg scaled(this, glsl_type::vec4_type);
540 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
541
542 dst_reg rounded(this, glsl_type::vec4_type);
543 emit(RNDE(rounded, src_reg(scaled)));
544
545 dst_reg u(this, glsl_type::uvec4_type);
546 emit(MOV(u, src_reg(rounded)));
547
548 src_reg bytes(u);
549 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
550 }
551
552 void
553 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
554 {
555 dst_reg max(this, glsl_type::vec4_type);
556 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
557
558 dst_reg min(this, glsl_type::vec4_type);
559 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
560
561 dst_reg scaled(this, glsl_type::vec4_type);
562 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
563
564 dst_reg rounded(this, glsl_type::vec4_type);
565 emit(RNDE(rounded, src_reg(scaled)));
566
567 dst_reg i(this, glsl_type::ivec4_type);
568 emit(MOV(i, src_reg(rounded)));
569
570 src_reg bytes(i);
571 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
572 }
573
574 /*
575 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
576 * false) elements needed to pack a type.
577 */
578 static int
579 type_size_xvec4(const struct glsl_type *type, bool as_vec4, bool bindless)
580 {
581 unsigned int i;
582 int size;
583
584 switch (type->base_type) {
585 case GLSL_TYPE_UINT:
586 case GLSL_TYPE_INT:
587 case GLSL_TYPE_FLOAT:
588 case GLSL_TYPE_FLOAT16:
589 case GLSL_TYPE_BOOL:
590 case GLSL_TYPE_DOUBLE:
591 case GLSL_TYPE_UINT16:
592 case GLSL_TYPE_INT16:
593 case GLSL_TYPE_UINT8:
594 case GLSL_TYPE_INT8:
595 case GLSL_TYPE_UINT64:
596 case GLSL_TYPE_INT64:
597 if (type->is_matrix()) {
598 const glsl_type *col_type = type->column_type();
599 unsigned col_slots =
600 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
601 return type->matrix_columns * col_slots;
602 } else {
603 /* Regardless of the size of the vector, it gets a vec4. This is bad
604 * packing for things like floats, but otherwise arrays become a
605 * mess. Hopefully a later pass over the code can pack scalars
606 * down if appropriate.
607 */
608 return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
609 }
610 case GLSL_TYPE_ARRAY:
611 assert(type->length > 0);
612 return type_size_xvec4(type->fields.array, as_vec4, bindless) *
613 type->length;
614 case GLSL_TYPE_STRUCT:
615 case GLSL_TYPE_INTERFACE:
616 size = 0;
617 for (i = 0; i < type->length; i++) {
618 size += type_size_xvec4(type->fields.structure[i].type, as_vec4,
619 bindless);
620 }
621 return size;
622 case GLSL_TYPE_SUBROUTINE:
623 return 1;
624
625 case GLSL_TYPE_SAMPLER:
626 /* Samplers take up no register space, since they're baked in at
627 * link time.
628 */
629 return bindless ? 1 : 0;
630 case GLSL_TYPE_ATOMIC_UINT:
631 return 0;
632 case GLSL_TYPE_IMAGE:
633 return bindless ? 1 : DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
634 case GLSL_TYPE_VOID:
635 case GLSL_TYPE_ERROR:
636 case GLSL_TYPE_FUNCTION:
637 unreachable("not reached");
638 }
639
640 return 0;
641 }
642
643 /**
644 * Returns the minimum number of vec4 elements needed to pack a type.
645 *
646 * For simple types, it will return 1 (a single vec4); for matrices, the
647 * number of columns; for array and struct, the sum of the vec4_size of
648 * each of its elements; and for sampler and atomic, zero.
649 *
650 * This method is useful to calculate how much register space is needed to
651 * store a particular type.
652 */
653 extern "C" int
654 type_size_vec4(const struct glsl_type *type, bool bindless)
655 {
656 return type_size_xvec4(type, true, bindless);
657 }
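/* A compiled-out sketch illustrating the rules above (assuming the usual
 * glsl_type static members, e.g. glsl_type::mat4_type):
 */
#if 0
static void
check_type_size_vec4_examples()
{
   assert(type_size_vec4(glsl_type::float_type, false) == 1); /* padded to a full vec4 */
   assert(type_size_vec4(glsl_type::mat4_type, false) == 4);  /* one slot per column */
   assert(type_size_vec4(glsl_type::dvec4_type, false) == 2); /* dual-slot type */
}
#endif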
658
659 /**
660 * Returns the minimum number of dvec4 elements needed to pack a type.
661 *
662 * For simple types, it will return 1 (a single dvec4); for matrices, the
663 * number of columns; for array and struct, the sum of the dvec4_size of
664 * each of its elements; and for sampler and atomic, zero.
665 *
666 * This method is useful to calculate how much register space is needed to
667 * store a particular type.
668 *
669 * Measuring double-precision vertex inputs as dvec4 is required because
670 * ARB_vertex_attrib_64bit states that these use the same number of locations
671 * as the single-precision version. That is, two consecutive dvec4s would be
672 * located in location "x" and location "x+1", not "x+2".
673 *
674 * In order to map vec4/dvec4 vertex inputs in the proper ATTRs,
675 * remap_vs_attrs() will take into account both the location and whether the
676 * type fits in one or two vec4 slots.
677 */
678 extern "C" int
679 type_size_dvec4(const struct glsl_type *type, bool bindless)
680 {
681 return type_size_xvec4(type, false, bindless);
682 }
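/* For example, a dvec4 counts as a single slot here (type_size_dvec4() == 1)
 * even though it needs two vec4-sized slots of storage (type_size_vec4() == 2);
 * remap_vs_attrs() reconciles those two views for vertex inputs.
 */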
683
684 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
685 {
686 init();
687
688 this->file = VGRF;
689 this->nr = v->alloc.allocate(type_size_vec4(type, false));
690
691 if (type->is_array() || type->is_struct()) {
692 this->swizzle = BRW_SWIZZLE_NOOP;
693 } else {
694 this->swizzle = brw_swizzle_for_size(type->vector_elements);
695 }
696
697 this->type = brw_type_for_base_type(type);
698 }
699
700 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
701 {
702 assert(size > 0);
703
704 init();
705
706 this->file = VGRF;
707 this->nr = v->alloc.allocate(type_size_vec4(type, false) * size);
708
709 this->swizzle = BRW_SWIZZLE_NOOP;
710
711 this->type = brw_type_for_base_type(type);
712 }
713
714 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
715 {
716 init();
717
718 this->file = VGRF;
719 this->nr = v->alloc.allocate(type_size_vec4(type, false));
720
721 if (type->is_array() || type->is_struct()) {
722 this->writemask = WRITEMASK_XYZW;
723 } else {
724 this->writemask = (1 << type->vector_elements) - 1;
725 }
726
727 this->type = brw_type_for_base_type(type);
728 }
729
730 vec4_instruction *
731 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
732 src_reg src0, src_reg src1)
733 {
734 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
735 inst->conditional_mod = conditionalmod;
736 return inst;
737 }
738
739 vec4_instruction *
740 vec4_visitor::emit_lrp(const dst_reg &dst,
741 const src_reg &x, const src_reg &y, const src_reg &a)
742 {
743 if (devinfo->gen >= 6 && devinfo->gen <= 10) {
744 /* Note that the instruction's argument order is reversed from GLSL
745 * and the IR.
746 */
747 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
748 fix_3src_operand(x)));
749 } else {
750 /* Earlier generations don't support three source operations, so we
751 * need to emit x*(1-a) + y*a.
752 */
753 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
754 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
755 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
756 y_times_a.writemask = dst.writemask;
757 one_minus_a.writemask = dst.writemask;
758 x_times_one_minus_a.writemask = dst.writemask;
759
760 emit(MUL(y_times_a, y, a));
761 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
762 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
763 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
764 }
765 }
766
767 /**
768 * Emits the instructions needed to perform a pull constant load. before_block
769 * and before_inst can be NULL in which case the instruction will be appended
770 * to the end of the instruction list.
771 */
772 void
773 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
774 src_reg surf_index,
775 src_reg offset_reg,
776 bblock_t *before_block,
777 vec4_instruction *before_inst)
778 {
779 assert((before_inst == NULL && before_block == NULL) ||
780 (before_inst && before_block));
781
782 vec4_instruction *pull;
783
784 if (devinfo->gen >= 9) {
785 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
786 src_reg header(this, glsl_type::uvec4_type, 2);
787
788 pull = new(mem_ctx)
789 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
790 dst_reg(header));
791
792 if (before_inst)
793 emit_before(before_block, before_inst, pull);
794 else
795 emit(pull);
796
797 dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
798 offset_reg.type);
799 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
800
801 if (before_inst)
802 emit_before(before_block, before_inst, pull);
803 else
804 emit(pull);
805
806 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
807 dst,
808 surf_index,
809 header);
810 pull->mlen = 2;
811 pull->header_size = 1;
812 } else if (devinfo->gen >= 7) {
813 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
814
815 grf_offset.type = offset_reg.type;
816
817 pull = MOV(grf_offset, offset_reg);
818
819 if (before_inst)
820 emit_before(before_block, before_inst, pull);
821 else
822 emit(pull);
823
824 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
825 dst,
826 surf_index,
827 src_reg(grf_offset));
828 pull->mlen = 1;
829 } else {
830 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
831 dst,
832 surf_index,
833 offset_reg);
834 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
835 pull->mlen = 1;
836 }
837
838 if (before_inst)
839 emit_before(before_block, before_inst, pull);
840 else
841 emit(pull);
842 }
843
844 src_reg
845 vec4_visitor::emit_uniformize(const src_reg &src)
846 {
847 const src_reg chan_index(this, glsl_type::uint_type);
848 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
849 src.type);
850
851 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
852 ->force_writemask_all = true;
853 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
854 ->force_writemask_all = true;
855
856 return src_reg(dst);
857 }
858
859 src_reg
860 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
861 src_reg coordinate, src_reg surface)
862 {
863 vec4_instruction *inst =
864 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
865 dst_reg(this, glsl_type::uvec4_type));
866 inst->base_mrf = 2;
867 inst->src[1] = surface;
868 inst->src[2] = brw_imm_ud(0); /* sampler */
869
870 int param_base;
871
872 if (devinfo->gen >= 9) {
873 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
874 vec4_instruction *header_inst = new(mem_ctx)
875 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
876 dst_reg(MRF, inst->base_mrf));
877
878 emit(header_inst);
879
880 inst->mlen = 2;
881 inst->header_size = 1;
882 param_base = inst->base_mrf + 1;
883 } else {
884 inst->mlen = 1;
885 param_base = inst->base_mrf;
886 }
887
888 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
889 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
890 int zero_mask = 0xf & ~coord_mask;
891
892 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
893 coordinate));
894
895 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
896 brw_imm_d(0)));
897
898 emit(inst);
899 return src_reg(inst->dst);
900 }
901
902 bool
903 vec4_visitor::is_high_sampler(src_reg sampler)
904 {
905 if (devinfo->gen < 8 && !devinfo->is_haswell)
906 return false;
907
908 return sampler.file != IMM || sampler.ud >= 16;
909 }
910
911 void
912 vec4_visitor::emit_texture(ir_texture_opcode op,
913 dst_reg dest,
914 const glsl_type *dest_type,
915 src_reg coordinate,
916 int coord_components,
917 src_reg shadow_comparator,
918 src_reg lod, src_reg lod2,
919 src_reg sample_index,
920 uint32_t constant_offset,
921 src_reg offset_value,
922 src_reg mcs,
923 uint32_t surface,
924 src_reg surface_reg,
925 src_reg sampler_reg)
926 {
927 enum opcode opcode;
928 switch (op) {
929 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
930 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
931 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
932 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
933 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
934 SHADER_OPCODE_TXF_CMS); break;
935 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
936 case ir_tg4: opcode = offset_value.file != BAD_FILE
937 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
938 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
939 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
940 case ir_txb:
941 unreachable("TXB is not valid for vertex shaders.");
942 case ir_lod:
943 unreachable("LOD is not valid for vertex shaders.");
944 case ir_samples_identical: {
945 /* There are some challenges implementing this for vec4, and it seems
946 * unlikely to be used anyway. For now, just always return false.
947 */
948 emit(MOV(dest, brw_imm_ud(0u)));
949 return;
950 }
951 default:
952 unreachable("Unrecognized tex op");
953 }
954
955 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
956
957 inst->offset = constant_offset;
958
959 /* The message header is necessary for:
960 * - Gen4 (always)
961 * - Gen9+ for selecting SIMD4x2
962 * - Texel offsets
963 * - Gather channel selection
964 * - Sampler indices too large to fit in a 4-bit value.
965 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
966 */
967 inst->header_size =
968 (devinfo->gen < 5 || devinfo->gen >= 9 ||
969 inst->offset != 0 || op == ir_tg4 ||
970 op == ir_texture_samples ||
971 is_high_sampler(sampler_reg)) ? 1 : 0;
972 inst->base_mrf = 2;
973 inst->mlen = inst->header_size;
974 inst->dst.writemask = WRITEMASK_XYZW;
975 inst->shadow_compare = shadow_comparator.file != BAD_FILE;
976
977 inst->src[1] = surface_reg;
978 inst->src[2] = sampler_reg;
979
980 /* MRF for the first parameter */
981 int param_base = inst->base_mrf + inst->header_size;
982
983 if (op == ir_txs || op == ir_query_levels) {
984 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
985 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
986 inst->mlen++;
987 } else if (op == ir_texture_samples) {
988 inst->dst.writemask = WRITEMASK_X;
989 } else {
990 /* Load the coordinate */
991 /* FINISHME: gl_clamp_mask and saturate */
992 int coord_mask = (1 << coord_components) - 1;
993 int zero_mask = 0xf & ~coord_mask;
994
995 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
996 coordinate));
997 inst->mlen++;
998
999 if (zero_mask != 0) {
1000 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
1001 brw_imm_d(0)));
1002 }
1003 /* Load the shadow comparator */
1004 if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1005 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
1006 WRITEMASK_X),
1007 shadow_comparator));
1008 inst->mlen++;
1009 }
1010
1011 /* Load the LOD info */
1012 if (op == ir_tex || op == ir_txl) {
1013 int mrf, writemask;
1014 if (devinfo->gen >= 5) {
1015 mrf = param_base + 1;
1016 if (shadow_comparator.file != BAD_FILE) {
1017 writemask = WRITEMASK_Y;
1018 /* mlen already incremented */
1019 } else {
1020 writemask = WRITEMASK_X;
1021 inst->mlen++;
1022 }
1023 } else /* devinfo->gen == 4 */ {
1024 mrf = param_base;
1025 writemask = WRITEMASK_W;
1026 }
1027 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1028 } else if (op == ir_txf) {
1029 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1030 } else if (op == ir_txf_ms) {
1031 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1032 sample_index));
1033 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1034 /* MCS data is stored in the first two channels of `mcs`, but we
1035 * need to get it into the .y and .z channels of the second vec4
1036 * of params.
1037 */
1038 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1039 emit(MOV(dst_reg(MRF, param_base + 1,
1040 glsl_type::uint_type, WRITEMASK_YZ),
1041 mcs));
1042 } else if (devinfo->gen >= 7) {
1043 /* MCS data is in the first channel of `mcs`, but we need to get it into
1044 * the .y channel of the second vec4 of params, so replicate .x across
1045 * the whole vec4 and then mask off everything except .y
1046 */
1047 mcs.swizzle = BRW_SWIZZLE_XXXX;
1048 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1049 mcs));
1050 }
1051 inst->mlen++;
1052 } else if (op == ir_txd) {
1053 const brw_reg_type type = lod.type;
1054
1055 if (devinfo->gen >= 5) {
1056 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1057 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1058 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1059 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1060 inst->mlen++;
1061
1062 if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
1063 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1064 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1065 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1066 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1067 inst->mlen++;
1068
1069 if (shadow_comparator.file != BAD_FILE) {
1070 emit(MOV(dst_reg(MRF, param_base + 2,
1071 shadow_comparator.type, WRITEMASK_Z),
1072 shadow_comparator));
1073 }
1074 }
1075 } else /* devinfo->gen == 4 */ {
1076 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1077 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1078 inst->mlen += 2;
1079 }
1080 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1081 if (shadow_comparator.file != BAD_FILE) {
1082 emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
1083 shadow_comparator));
1084 }
1085
1086 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1087 offset_value));
1088 inst->mlen++;
1089 }
1090 }
1091
1092 emit(inst);
1093
1094 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1095 * spec requires layers.
1096 */
1097 if (op == ir_txs && devinfo->gen < 7) {
1098 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1099 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1100 src_reg(inst->dst), brw_imm_d(1));
1101 }
1102
1103 if (devinfo->gen == 6 && op == ir_tg4) {
1104 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1105 }
1106
1107 if (op == ir_query_levels) {
1108 /* # levels is in .w */
1109 src_reg swizzled(dest);
1110 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1111 SWIZZLE_W, SWIZZLE_W);
1112 emit(MOV(dest, swizzled));
1113 }
1114 }
1115
1116 /**
1117 * Apply workarounds for Gen6 gather with UINT/SINT
1118 */
1119 void
1120 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1121 {
1122 if (!wa)
1123 return;
1124
1125 int width = (wa & WA_8BIT) ? 8 : 16;
1126 dst_reg dst_f = dst;
1127 dst_f.type = BRW_REGISTER_TYPE_F;
1128
1129 /* Convert from UNORM to UINT */
1130 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1131 emit(MOV(dst, src_reg(dst_f)));
1132
1133 if (wa & WA_SIGN) {
1134 /* Reinterpret the UINT value as a signed INT value by
1135 * shifting the sign bit into place, then shifting back
1136 * preserving sign.
1137 */
1138 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1139 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1140 }
1141 }
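/* Worked example: with the 8-bit workaround, a texel whose raw value is 0xff
 * comes back from the sampler as UNORM 1.0. The MUL above rescales it to
 * 255.0f, the MOV converts that to 0x000000ff, SHL by 24 gives 0xff000000,
 * and ASR by 24 gives 0xffffffff, i.e. the correctly sign-extended SINT
 * value -1.
 */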
1142
1143 void
1144 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1145 {
1146 unreachable("not reached");
1147 }
1148
1149 void
1150 vec4_visitor::gs_end_primitive()
1151 {
1152 unreachable("not reached");
1153 }
1154
1155 void
1156 vec4_visitor::emit_ndc_computation()
1157 {
1158 if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1159 return;
1160
1161 /* Get the position */
1162 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1163
1164 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1165 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1166 output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1167 output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1168
1169 current_annotation = "NDC";
1170 dst_reg ndc_w = ndc;
1171 ndc_w.writemask = WRITEMASK_W;
1172 src_reg pos_w = pos;
1173 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1174 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1175
1176 dst_reg ndc_xyz = ndc;
1177 ndc_xyz.writemask = WRITEMASK_XYZ;
1178
1179 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1180 }
1181
1182 void
1183 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1184 {
1185 if (devinfo->gen < 6 &&
1186 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1187 output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1188 devinfo->has_negative_rhw_bug)) {
1189 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1190 dst_reg header1_w = header1;
1191 header1_w.writemask = WRITEMASK_W;
1192
1193 emit(MOV(header1, brw_imm_ud(0u)));
1194
1195 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1196 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1197
1198 current_annotation = "Point size";
1199 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1200 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1201 }
1202
1203 if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1204 current_annotation = "Clipping flags";
1205 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1206
1207 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1208 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1209 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1210 }
1211
1212 if (output_reg[VARYING_SLOT_CLIP_DIST1][0].file != BAD_FILE) {
1213 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1214 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1215 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1216 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1217 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1218 }
1219
1220 /* i965 clipping workaround:
1221 * 1) Test for -ve rhw
1222 * 2) If set,
1223 * set ndc = (0,0,0,0)
1224 * set ucp[6] = 1
1225 *
1226 * Later, clipping will detect ucp[6] and ensure the primitive is
1227 * clipped against all fixed planes.
1228 */
1229 if (devinfo->has_negative_rhw_bug &&
1230 output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1231 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1232 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1233 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1234 vec4_instruction *inst;
1235 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1236 inst->predicate = BRW_PREDICATE_NORMAL;
1237 output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1238 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1239 inst->predicate = BRW_PREDICATE_NORMAL;
1240 }
1241
1242 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1243 } else if (devinfo->gen < 6) {
1244 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1245 } else {
1246 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1247 if (output_reg[VARYING_SLOT_PSIZ][0].file != BAD_FILE) {
1248 dst_reg reg_w = reg;
1249 reg_w.writemask = WRITEMASK_W;
1250 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1251 reg_as_src.type = reg_w.type;
1252 reg_as_src.swizzle = brw_swizzle_for_size(1);
1253 emit(MOV(reg_w, reg_as_src));
1254 }
1255 if (output_reg[VARYING_SLOT_LAYER][0].file != BAD_FILE) {
1256 dst_reg reg_y = reg;
1257 reg_y.writemask = WRITEMASK_Y;
1258 reg_y.type = BRW_REGISTER_TYPE_D;
1259 output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1260 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1261 }
1262 if (output_reg[VARYING_SLOT_VIEWPORT][0].file != BAD_FILE) {
1263 dst_reg reg_z = reg;
1264 reg_z.writemask = WRITEMASK_Z;
1265 reg_z.type = BRW_REGISTER_TYPE_D;
1266 output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1267 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1268 }
1269 }
1270 }
1271
1272 vec4_instruction *
1273 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1274 {
1275 assert(varying < VARYING_SLOT_MAX);
1276
1277 unsigned num_comps = output_num_components[varying][component];
1278 if (num_comps == 0)
1279 return NULL;
1280
1281 assert(output_reg[varying][component].type == reg.type);
1282 current_annotation = output_reg_annotation[varying];
1283 if (output_reg[varying][component].file != BAD_FILE) {
1284 src_reg src = src_reg(output_reg[varying][component]);
1285 src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1286 reg.writemask =
1287 brw_writemask_for_component_packing(num_comps, component);
1288 return emit(MOV(reg, src));
1289 }
1290 return NULL;
1291 }
1292
1293 void
1294 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1295 {
1296 reg.type = BRW_REGISTER_TYPE_F;
1297 output_reg[varying][0].type = reg.type;
1298
1299 switch (varying) {
1300 case VARYING_SLOT_PSIZ:
1301 {
1302 /* PSIZ is always in slot 0, and is coupled with other flags. */
1303 current_annotation = "indices, point width, clip flags";
1304 emit_psiz_and_flags(reg);
1305 break;
1306 }
1307 case BRW_VARYING_SLOT_NDC:
1308 current_annotation = "NDC";
1309 if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1310 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1311 break;
1312 case VARYING_SLOT_POS:
1313 current_annotation = "gl_Position";
1314 if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1315 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1316 break;
1317 case VARYING_SLOT_EDGE: {
1318 /* This is present when doing unfilled polygons. We're supposed to copy
1319 * the edge flag from the user-provided vertex array
1320 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1321 * of that attribute (starts as 1.0f). This is then used in clipping to
1322 * determine which edges should be drawn as wireframe.
1323 */
1324 current_annotation = "edge flag";
1325 int edge_attr = util_bitcount64(nir->info.inputs_read &
1326 BITFIELD64_MASK(VERT_ATTRIB_EDGEFLAG));
1327 emit(MOV(reg, src_reg(dst_reg(ATTR, edge_attr,
1328 glsl_type::float_type, WRITEMASK_XYZW))));
1329 break;
1330 }
1331 case BRW_VARYING_SLOT_PAD:
1332 /* No need to write to this slot */
1333 break;
1334 default:
1335 for (int i = 0; i < 4; i++) {
1336 emit_generic_urb_slot(reg, varying, i);
1337 }
1338 break;
1339 }
1340 }
1341
1342 static unsigned
1343 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, unsigned mlen)
1344 {
1345 if (devinfo->gen >= 6) {
1346 /* URB data written (does not include the message header reg) must
1347 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1348 * section 5.4.3.2.2: URB_INTERLEAVED.
1349 *
1350 * URB entries are allocated on a multiple of 1024 bits, so an
1351 * extra 128 bits written here to make the end align to 256 is
1352 * no problem.
1353 */
1354 if ((mlen % 2) != 1)
1355 mlen++;
1356 }
1357
1358 return mlen;
1359 }
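/* For example, a header plus three data registers (mlen == 4) is padded to
 * mlen == 5 so that the data portion is an even number of registers, while a
 * header plus four data registers (mlen == 5) is already aligned and left
 * alone.
 */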
1360
1361
1362 /**
1363 * Generates the VUE payload plus the necessary URB write instructions to
1364 * output it.
1365 *
1366 * The VUE layout is documented in Volume 2a.
1367 */
1368 void
1369 vec4_visitor::emit_vertex()
1370 {
1371 /* MRF 0 is reserved for the debugger, so start with message header
1372 * in MRF 1.
1373 */
1374 int base_mrf = 1;
1375 int mrf = base_mrf;
1376 /* In the process of generating our URB write message contents, we
1377 * may need to unspill a register or load from an array. Those
1378 * reads would use MRFs 14-15.
1379 */
1380 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1381
1382 /* The following assertion verifies that max_usable_mrf causes an
1383 * even-numbered amount of URB write data, which will meet gen6's
1384 * requirements for length alignment.
1385 */
1386 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1387
1388 /* First mrf is the g0-based message header containing URB handles and
1389 * such.
1390 */
1391 emit_urb_write_header(mrf++);
1392
1393 if (devinfo->gen < 6) {
1394 emit_ndc_computation();
1395 }
1396
1397 /* We may need to split this up into several URB writes, so do them in a
1398 * loop.
1399 */
1400 int slot = 0;
1401 bool complete = false;
1402 do {
1403 /* URB offset is in URB row increments, and each of our MRFs is half of
1404 * one of those, since we're doing interleaved writes.
1405 */
1406 int offset = slot / 2;
1407
1408 mrf = base_mrf + 1;
1409 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1410 emit_urb_slot(dst_reg(MRF, mrf++),
1411 prog_data->vue_map.slot_to_varying[slot]);
1412
1413 /* If this was max_usable_mrf, we can't fit anything more into this
1414 * URB WRITE. Same thing if we reached the maximum length available.
1415 */
1416 if (mrf > max_usable_mrf ||
1417 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1418 slot++;
1419 break;
1420 }
1421 }
1422
1423 complete = slot >= prog_data->vue_map.num_slots;
1424 current_annotation = "URB write";
1425 vec4_instruction *inst = emit_urb_write_opcode(complete);
1426 inst->base_mrf = base_mrf;
1427 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1428 inst->offset += offset;
1429 } while(!complete);
1430 }
1431
1432
1433 src_reg
1434 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1435 src_reg *reladdr, int reg_offset)
1436 {
1437 /* Because we store the values to scratch interleaved like our
1438 * vertex data, we need to scale the vec4 index by 2.
1439 */
1440 int message_header_scale = 2;
1441
1442 /* Pre-gen6, the message header uses byte offsets instead of vec4
1443 * (16-byte) offset units.
1444 */
1445 if (devinfo->gen < 6)
1446 message_header_scale *= 16;
1447
1448 if (reladdr) {
1449 /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
1450 * to multiply the reladdr by 2. Notice that the reg_offset part
1451 * is in units of 16 bytes and is used to select the low/high 16-byte
1452 * chunk of a full dvec4, so we don't want to multiply that part.
1453 */
1454 src_reg index = src_reg(this, glsl_type::int_type);
1455 if (type_sz(inst->dst.type) < 8) {
1456 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1457 brw_imm_d(reg_offset)));
1458 emit_before(block, inst, MUL(dst_reg(index), index,
1459 brw_imm_d(message_header_scale)));
1460 } else {
1461 emit_before(block, inst, MUL(dst_reg(index), *reladdr,
1462 brw_imm_d(message_header_scale * 2)));
1463 emit_before(block, inst, ADD(dst_reg(index), index,
1464 brw_imm_d(reg_offset * message_header_scale)));
1465 }
1466 return index;
1467 } else {
1468 return brw_imm_d(reg_offset * message_header_scale);
1469 }
1470 }
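/* For example, a constant reg_offset of 3 becomes an immediate offset of 6 on
 * gen6+ (interleaved vec4 rows) and 96 on gen4/5 (byte offsets).
 */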
1471
1472 /**
1473 * Emits an instruction before @inst to load the value named by @orig_src
1474 * from scratch space at @base_offset to @temp.
1475 *
1476 * @base_offset is measured in 32-byte units (the size of a register).
1477 */
1478 void
1479 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1480 dst_reg temp, src_reg orig_src,
1481 int base_offset)
1482 {
1483 assert(orig_src.offset % REG_SIZE == 0);
1484 int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1485 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1486 reg_offset);
1487
1488 if (type_sz(orig_src.type) < 8) {
1489 emit_before(block, inst, SCRATCH_READ(temp, index));
1490 } else {
1491 dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
1492 dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
1493 emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
1494 index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
1495 vec4_instruction *last_read =
1496 SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
1497 emit_before(block, inst, last_read);
1498 shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
1499 }
1500 }
1501
1502 /**
1503 * Emits an instruction after @inst to store the value to be written
1504 * to @orig_dst to scratch space at @base_offset, from @temp.
1505 *
1506 * @base_offset is measured in 32-byte units (the size of a register).
1507 */
1508 void
1509 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1510 int base_offset)
1511 {
1512 assert(inst->dst.offset % REG_SIZE == 0);
1513 int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1514 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1515 reg_offset);
1516
1517 /* Create a temporary register to store *inst's result in.
1518 *
1519 * We have to be careful in MOVing from our temporary result register in
1520 * the scratch write. If we swizzle from channels of the temporary that
1521 * weren't initialized, it will confuse live interval analysis, which will
1522 * make spilling fail to make progress.
1523 */
1524 bool is_64bit = type_sz(inst->dst.type) == 8;
1525 const glsl_type *alloc_type =
1526 is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
1527 const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
1528 inst->dst.type),
1529 brw_swizzle_for_mask(inst->dst.writemask));
1530
1531 if (!is_64bit) {
1532 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1533 inst->dst.writemask));
1534 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1535 if (inst->opcode != BRW_OPCODE_SEL)
1536 write->predicate = inst->predicate;
1537 write->ir = inst->ir;
1538 write->annotation = inst->annotation;
1539 inst->insert_after(block, write);
1540 } else {
1541 dst_reg shuffled = dst_reg(this, alloc_type);
1542 vec4_instruction *last =
1543 shuffle_64bit_data(shuffled, temp, true, block, inst);
1544 src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
1545
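      /* Each 64-bit channel of the shuffled data occupies two 32-bit channels
       * of the float view, so expand the dvec writemask accordingly: X and Y
       * map onto the first register (XY/ZW), Z and W onto the second.
       */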
1546 uint8_t mask = 0;
1547 if (inst->dst.writemask & WRITEMASK_X)
1548 mask |= WRITEMASK_XY;
1549 if (inst->dst.writemask & WRITEMASK_Y)
1550 mask |= WRITEMASK_ZW;
1551 if (mask) {
1552 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1553
1554 vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
1555 if (inst->opcode != BRW_OPCODE_SEL)
1556 write->predicate = inst->predicate;
1557 write->ir = inst->ir;
1558 write->annotation = inst->annotation;
1559 last->insert_after(block, write);
1560 }
1561
1562 mask = 0;
1563 if (inst->dst.writemask & WRITEMASK_Z)
1564 mask |= WRITEMASK_XY;
1565 if (inst->dst.writemask & WRITEMASK_W)
1566 mask |= WRITEMASK_ZW;
1567 if (mask) {
1568 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
1569
1570 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1571 reg_offset + 1);
1572 vec4_instruction *write =
1573 SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
1574 if (inst->opcode != BRW_OPCODE_SEL)
1575 write->predicate = inst->predicate;
1576 write->ir = inst->ir;
1577 write->annotation = inst->annotation;
1578 last->insert_after(block, write);
1579 }
1580 }
1581
1582 inst->dst.file = temp.file;
1583 inst->dst.nr = temp.nr;
1584 inst->dst.offset %= REG_SIZE;
1585 inst->dst.reladdr = NULL;
1586 }
1587
1588 /**
1589 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1590 * adds the scratch read(s) before \p inst. The function also checks for
1591 * recursive reladdr scratch accesses, issuing the corresponding scratch
1592 * loads and rewriting reladdr references accordingly.
1593 *
1594 * \return \p src if it did not require a scratch load, otherwise, the
1595 * register holding the result of the scratch load that the caller should
1596 * use to rewrite src.
1597 */
1598 src_reg
1599 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1600 vec4_instruction *inst, src_reg src)
1601 {
1602 /* Resolve recursive reladdr scratch access by calling ourselves
1603 * with src.reladdr
1604 */
1605 if (src.reladdr)
1606 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1607 *src.reladdr);
1608
1609 /* Now handle scratch access on src */
1610 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1611 dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
1612 glsl_type::dvec4_type : glsl_type::vec4_type);
1613 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1614 src.nr = temp.nr;
1615 src.offset %= REG_SIZE;
1616 src.reladdr = NULL;
1617 }
1618
1619 return src;
1620 }
1621
1622 /**
1623 * We can't generally support array access in GRF space, because a
1624 * single instruction's destination can only span 2 contiguous
1625 * registers. So, we send all GRF arrays that get variable index
1626 * access to scratch space.
1627 */
1628 void
1629 vec4_visitor::move_grf_array_access_to_scratch()
1630 {
1631 int scratch_loc[this->alloc.count];
1632 memset(scratch_loc, -1, sizeof(scratch_loc));
1633
1634 /* First, calculate the set of virtual GRFs that need to be punted
1635 * to scratch due to having any array access on them, and where in
1636 * scratch.
1637 */
1638 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1639 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1640 if (scratch_loc[inst->dst.nr] == -1) {
1641 scratch_loc[inst->dst.nr] = last_scratch;
1642 last_scratch += this->alloc.sizes[inst->dst.nr];
1643 }
1644
1645 for (src_reg *iter = inst->dst.reladdr;
1646 iter->reladdr;
1647 iter = iter->reladdr) {
1648 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1649 scratch_loc[iter->nr] = last_scratch;
1650 last_scratch += this->alloc.sizes[iter->nr];
1651 }
1652 }
1653 }
1654
1655 for (int i = 0 ; i < 3; i++) {
1656 for (src_reg *iter = &inst->src[i];
1657 iter->reladdr;
1658 iter = iter->reladdr) {
1659 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1660 scratch_loc[iter->nr] = last_scratch;
1661 last_scratch += this->alloc.sizes[iter->nr];
1662 }
1663 }
1664 }
1665 }
1666
1667 /* Now, for anything that will be accessed through scratch, rewrite
1668 * it to load/store. Note that this is a _safe list walk, because
1669 * we may generate a new scratch_write instruction after the one
1670 * we're processing.
1671 */
1672 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1673 /* Set up the annotation tracking for new generated instructions. */
1674 base_ir = inst->ir;
1675 current_annotation = inst->annotation;
1676
1677 /* First handle scratch access on the dst. Notice we have to handle
1678 * the case where the dst's reladdr also points to scratch space.
1679 */
1680 if (inst->dst.reladdr)
1681 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1682 *inst->dst.reladdr);
1683
1684 /* Now that we have handled any (possibly recursive) reladdr scratch
1685 * accesses for dst we can safely do the scratch write for dst itself
1686 */
1687 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1688 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1689
1690 /* Now handle scratch access on any src. In this case, since inst->src[i]
1691 * already is a src_reg, we can just call emit_resolve_reladdr with
1692 * inst->src[i] and it will take care of handling scratch loads for
1693 * both src and src.reladdr (recursively).
1694 */
1695 for (int i = 0 ; i < 3; i++) {
1696 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1697 inst->src[i]);
1698 }
1699 }
1700 }
1701
1702 /**
1703 * Emits an instruction before @inst to load the value named by @orig_src
1704 * from the pull constant buffer (surface) at @base_offset to @temp.
1705 */
1706 void
1707 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1708 dst_reg temp, src_reg orig_src,
1709 int base_offset, src_reg indirect)
1710 {
1711 assert(orig_src.offset % 16 == 0);
1712 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1713
1714 /* For 64bit loads we need to emit two 32-bit load messages and we also
1715 * need to shuffle the 32-bit data result into proper 64-bit data. To do
1716 * that we emit the 32-bit loads into a temporary and we shuffle the result
1717 * into the original destination.
1718 */
1719 dst_reg orig_temp = temp;
1720 bool is_64bit = type_sz(orig_src.type) == 8;
1721 if (is_64bit) {
1722 assert(type_sz(temp.type) == 8);
1723 dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
1724 temp = retype(temp_df, BRW_REGISTER_TYPE_F);
1725 }
1726
1727 src_reg src = orig_src;
1728 for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
1729 int reg_offset = base_offset + src.offset / 16;
1730
1731 src_reg offset;
1732 if (indirect.file != BAD_FILE) {
1733 offset = src_reg(this, glsl_type::uint_type);
1734 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1735 brw_imm_ud(reg_offset * 16)));
1736 } else if (devinfo->gen >= 8) {
1737 /* Store the offset in a GRF so we can send-from-GRF. */
1738 offset = src_reg(this, glsl_type::uint_type);
1739 emit_before(block, inst, MOV(dst_reg(offset),
1740 brw_imm_ud(reg_offset * 16)));
1741 } else {
1742 offset = brw_imm_d(reg_offset * 16);
1743 }
1744
1745 emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
1746 brw_imm_ud(index),
1747 offset,
1748 block, inst);
1749
1750 src = byte_offset(src, 16);
1751 }
1752
1753 if (is_64bit) {
1754 temp = retype(temp, BRW_REGISTER_TYPE_DF);
1755 shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
1756 }
1757 }
1758
1759 /**
1760 * Implements array access of uniforms by inserting a
1761 * PULL_CONSTANT_LOAD instruction.
1762 *
1763 * Unlike temporary GRF array access (where we don't support it due to
1764 * the difficulty of doing relative addressing on instruction
1765 * destinations), we could potentially do array access of uniforms
1766 * that were loaded in GRF space as push constants. In real-world
1767 * usage we've seen, though, the arrays being used are always larger
1768 * than we could load as push constants, so just always move all
1769 * uniform array access out to a pull constant buffer.
1770 */
1771 void
1772 vec4_visitor::move_uniform_array_access_to_pull_constants()
1773 {
1774 /* The Vulkan driver doesn't support pull constants other than UBOs, so
1775 * everything has to be pushed regardless.
1776 */
1777 if (!compiler->supports_pull_constants) {
1778 split_uniform_registers();
1779 return;
1780 }
1781
1782 /* Allocate the pull_params array */
1783 assert(stage_prog_data->nr_pull_params == 0);
1784 stage_prog_data->pull_param = ralloc_array(mem_ctx, uint32_t,
1785 this->uniforms * 4);
1786
1787 int pull_constant_loc[this->uniforms];
1788 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1789
1790 /* First, walk through the instructions and determine which things need to
1791 * be pulled. We mark something as needing to be pulled by setting
1792 * pull_constant_loc to 0.
1793 */
1794 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1795 /* We only care about MOV_INDIRECT of a uniform */
1796 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1797 inst->src[0].file != UNIFORM)
1798 continue;
1799
1800 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1801
1802 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1803 pull_constant_loc[uniform_nr + j] = 0;
1804 }
1805
1806 /* Next, we walk the list of uniforms and assign real pull constant
1807 * locations and set their corresponding entries in pull_param.
1808 */
1809 for (int j = 0; j < this->uniforms; j++) {
1810 if (pull_constant_loc[j] < 0)
1811 continue;
1812
1813 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1814
1815 for (int i = 0; i < 4; i++) {
1816 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1817 = stage_prog_data->param[j * 4 + i];
1818 }
1819 }
1820
1821 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1822 * instructions to actual uniform pulls.
1823 */
1824 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1825 /* We only care about MOV_INDIRECT of a uniform */
1826 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1827 inst->src[0].file != UNIFORM)
1828 continue;
1829
1830 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1831
1832 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1833
1834 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1835 pull_constant_loc[uniform_nr], inst->src[1]);
1836 inst->remove(block);
1837 }
1838
1839 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1840 * no need to track them as larger-than-vec4 objects. This will be
1841 * relied on in cutting out unused uniform vectors from push
1842 * constants.
1843 */
1844 split_uniform_registers();
1845 }
1846
1847 void
1848 vec4_visitor::resolve_ud_negate(src_reg *reg)
1849 {
1850 if (reg->type != BRW_REGISTER_TYPE_UD ||
1851 !reg->negate)
1852 return;
1853
1854 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1855 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1856 *reg = temp;
1857 }
1858
1859 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1860 void *log_data,
1861 const struct brw_sampler_prog_key_data *key_tex,
1862 struct brw_vue_prog_data *prog_data,
1863 const nir_shader *shader,
1864 void *mem_ctx,
1865 bool no_spills,
1866 int shader_time_index)
1867 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1868 key_tex(key_tex),
1869 prog_data(prog_data),
1870 fail_msg(NULL),
1871 first_non_payload_grf(0),
1872 need_all_constants_in_pull_buffer(false),
1873 no_spills(no_spills),
1874 shader_time_index(shader_time_index),
1875 last_scratch(0)
1876 {
1877 this->failed = false;
1878
1879 this->base_ir = NULL;
1880 this->current_annotation = NULL;
1881 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1882
1883 memset(this->output_num_components, 0, sizeof(this->output_num_components));
1884
1885 this->virtual_grf_start = NULL;
1886 this->virtual_grf_end = NULL;
1887 this->live_intervals = NULL;
1888
1889 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1890
1891 this->uniforms = 0;
1892 }
1893
1894
1895 void
1896 vec4_visitor::fail(const char *format, ...)
1897 {
1898 va_list va;
1899 char *msg;
1900
1901 if (failed)
1902 return;
1903
1904 failed = true;
1905
1906 va_start(va, format);
1907 msg = ralloc_vasprintf(mem_ctx, format, va);
1908 va_end(va);
1909 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1910
1911 this->fail_msg = msg;
1912
1913 if (debug_enabled) {
1914 fprintf(stderr, "%s", msg);
1915 }
1916 }
1917
1918 } /* namespace brw */