i965: Enable EXT_shader_samples_identical
[mesa.git] / src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of each destination channel to the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
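/* A minimal usage sketch: pre-gen6 MIN/MAX is built from a CMP followed by
 * a predicated SEL, e.g.
 *
 *    emit(CMP(dst, src0, src1, BRW_CONDITIONAL_L));
 *    inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 *    inst->predicate = BRW_PREDICATE_NORMAL;
 *
 * which is exactly what emit_minmax() below does on older hardware.
 */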
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240
241 resolve_ud_negate(&src0);
242 resolve_ud_negate(&src1);
243
244 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
245 inst->conditional_mod = condition;
246
247 return inst;
248 }
249
250 vec4_instruction *
251 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
252 {
253 vec4_instruction *inst;
254
255 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
256 dst, index);
257 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
258 inst->mlen = 2;
259
260 return inst;
261 }
262
263 vec4_instruction *
264 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
265 const src_reg &index)
266 {
267 vec4_instruction *inst;
268
269 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
270 dst, src, index);
271 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
272 inst->mlen = 3;
273
274 return inst;
275 }
276
277 src_reg
278 vec4_visitor::fix_3src_operand(const src_reg &src)
279 {
280 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
281 * able to use vertical stride of zero to replicate the vec4 uniform, like
282 *
283 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
284 *
285 * But you can't, since vertical stride is always four in three-source
286 * instructions. Instead, insert a MOV instruction to do the replication so
287 * that the three-source instruction can consume it.
288 */
289
290 /* The MOV is only needed if the source is a uniform or immediate. */
291 if (src.file != UNIFORM && src.file != IMM)
292 return src;
293
294 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
295 return src;
296
297 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
298 expanded.type = src.type;
299 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
300 return src_reg(expanded);
301 }
302
303 src_reg
304 vec4_visitor::resolve_source_modifiers(const src_reg &src)
305 {
306 if (!src.abs && !src.negate)
307 return src;
308
309 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
310 resolved.type = src.type;
311 emit(MOV(resolved, src));
312
313 return src_reg(resolved);
314 }
315
316 src_reg
317 vec4_visitor::fix_math_operand(const src_reg &src)
318 {
319 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
320 return src;
321
322 /* The gen6 math instruction ignores the source modifiers --
323 * swizzle, abs, negate, and at least some parts of the register
324 * region description.
325 *
326 * Rather than trying to enumerate all these cases, *always* expand the
327 * operand to a temp GRF for gen6.
328 *
329 * For gen7, keep the operand as-is, except if immediate, which gen7 still
330 * can't use.
331 */
332
333 if (devinfo->gen == 7 && src.file != IMM)
334 return src;
335
336 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
337 expanded.type = src.type;
338 emit(MOV(expanded, src));
339 return src_reg(expanded);
340 }
341
342 vec4_instruction *
343 vec4_visitor::emit_math(enum opcode opcode,
344 const dst_reg &dst,
345 const src_reg &src0, const src_reg &src1)
346 {
347 vec4_instruction *math =
348 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
349
350 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
351 /* MATH on Gen6 must be align1, so we can't do writemasks. */
352 math->dst = dst_reg(this, glsl_type::vec4_type);
353 math->dst.type = dst.type;
354 math = emit(MOV(dst, src_reg(math->dst)));
355 } else if (devinfo->gen < 6) {
356 math->base_mrf = 1;
357 math->mlen = src1.file == BAD_FILE ? 1 : 2;
358 }
359
360 return math;
361 }
362
363 void
364 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
365 {
366 if (devinfo->gen < 7) {
367 unreachable("ir_unop_pack_half_2x16 should be lowered");
368 }
369
370 assert(dst.type == BRW_REGISTER_TYPE_UD);
371 assert(src0.type == BRW_REGISTER_TYPE_F);
372
373 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
374 *
375 * Because this instruction does not have a 16-bit floating-point type,
376 * the destination data type must be Word (W).
377 *
378 * The destination must be DWord-aligned and specify a horizontal stride
379 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
380 * each destination channel and the upper word is not modified.
381 *
382 * The above restriction implies that the f32to16 instruction must use
383 * align1 mode, because only in align1 mode is it possible to specify
384 * horizontal stride. We choose here to defy the hardware docs and emit
385 * align16 instructions.
386 *
387 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
388 * instructions. I was partially successful in that the code passed all
389 * tests. However, the code was dubiously correct and fragile, and the
390 * tests were not harsh enough to probe that frailty. Not trusting the
391 * code, I chose instead to remain in align16 mode in defiance of the hw
392 * docs).
393 *
394 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
395 * simulator, emitting a f32to16 in align16 mode with UD as destination
396 * data type is safe. The behavior differs from that specified in the PRM
397 * in that the upper word of each destination channel is cleared to 0.
398 */
399
400 dst_reg tmp_dst(this, glsl_type::uvec2_type);
401 src_reg tmp_src(tmp_dst);
402
403 #if 0
404 /* Verify the undocumented behavior on which the following instructions
405 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
406 * then the result of the bit-or instruction below will be incorrect.
407 *
408 * You should inspect the disasm output in order to verify that the MOV is
409 * not optimized away.
410 */
411 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
412 #endif
413
414 /* Give tmp the form below, where "." means untouched.
415 *
416 *  w z    y          x       w z    y          x
417 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
418 *
419 * That the upper word of each write-channel be 0 is required for the
420 * following bit-shift and bit-or instructions to work. Note that this
421 * relies on the undocumented hardware behavior mentioned above.
422 */
423 tmp_dst.writemask = WRITEMASK_XY;
424 emit(F32TO16(tmp_dst, src0));
425
426 /* Give the write-channels of dst the form:
427 * 0xhhhh0000
428 */
429 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
430 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
431
432 /* Finally, give the write-channels of dst the form of packHalf2x16's
433 * output:
434 * 0xhhhhllll
435 */
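/* Worked example: packHalf2x16(vec2(1.0, -2.0)). f32to16 left
 * tmp = (0x00003c00, 0x0000c000, ., .), the SHL above wrote 0xc0000000 to
 * dst, and the OR below combines that with tmp.x to give 0xc0003c00.
 */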
436 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
437 emit(OR(dst, src_reg(dst), tmp_src));
438 }
439
440 void
441 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
442 {
443 if (devinfo->gen < 7) {
444 unreachable("ir_unop_unpack_half_2x16 should be lowered");
445 }
446
447 assert(dst.type == BRW_REGISTER_TYPE_F);
448 assert(src0.type == BRW_REGISTER_TYPE_UD);
449
450 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
451 *
452 * Because this instruction does not have a 16-bit floating-point type,
453 * the source data type must be Word (W). The destination type must be
454 * F (Float).
455 *
456 * To use W as the source data type, we must adjust horizontal strides,
457 * which is only possible in align1 mode. All my [chadv] attempts at
458 * emitting align1 instructions for unpackHalf2x16 failed to pass the
459 * Piglit tests, so I gave up.
460 *
461 * I've verified that, on gen7 hardware and the simulator, it is safe to
462 * emit f16to32 in align16 mode with UD as source data type.
463 */
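/* Worked example: unpackHalf2x16(0xc0003c00u). The AND/SHR below produce
 * tmp = (0x3c00, 0xc000, ., .), and f16to32 expands that to (1.0, -2.0).
 */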
464
465 dst_reg tmp_dst(this, glsl_type::uvec2_type);
466 src_reg tmp_src(tmp_dst);
467
468 tmp_dst.writemask = WRITEMASK_X;
469 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
470
471 tmp_dst.writemask = WRITEMASK_Y;
472 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
473
474 dst.writemask = WRITEMASK_XY;
475 emit(F16TO32(dst, tmp_src));
476 }
477
478 void
479 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
480 {
481 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
482 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
483 * is not suitable to generate the shift values, but we can use the packed
484 * vector float and a type-converting MOV.
485 */
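/* A note on the immediate: each byte of a VF (packed vector-float)
 * immediate is a restricted 8-bit float with 1 sign bit, 3 exponent bits
 * (bias 3) and 4 mantissa bits, so 0x00, 0x60, 0x70 and 0x78 decode to
 * 0.0, 8.0, 16.0 and 24.0 -- the shift vector <0, 8, 16, 24>.
 */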
486 dst_reg shift(this, glsl_type::uvec4_type);
487 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
488
489 dst_reg shifted(this, glsl_type::uvec4_type);
490 src0.swizzle = BRW_SWIZZLE_XXXX;
491 emit(SHR(shifted, src0, src_reg(shift)));
492
493 shifted.type = BRW_REGISTER_TYPE_UB;
494 dst_reg f(this, glsl_type::vec4_type);
495 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
496
497 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
498 }
499
500 void
501 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
502 {
503 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
504 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
505 * is not suitable to generate the shift values, but we can use the packed
506 * vector float and a type-converting MOV.
507 */
508 dst_reg shift(this, glsl_type::uvec4_type);
509 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
510
511 dst_reg shifted(this, glsl_type::uvec4_type);
512 src0.swizzle = BRW_SWIZZLE_XXXX;
513 emit(SHR(shifted, src0, src_reg(shift)));
514
515 shifted.type = BRW_REGISTER_TYPE_B;
516 dst_reg f(this, glsl_type::vec4_type);
517 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
518
519 dst_reg scaled(this, glsl_type::vec4_type);
520 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
521
522 dst_reg max(this, glsl_type::vec4_type);
523 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
524 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
525 }
526
527 void
528 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
529 {
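/* Sketch of the recipe, matching GLSL's packUnorm4x8: clamp each channel
 * to [0, 1], scale by 255, round to nearest even, convert to integer and
 * pack the low byte of each channel with .x in the least significant byte.
 * E.g. vec4(0.0, 0.2, 0.5, 1.0) packs to 0xff803300.
 */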
530 dst_reg saturated(this, glsl_type::vec4_type);
531 vec4_instruction *inst = emit(MOV(saturated, src0));
532 inst->saturate = true;
533
534 dst_reg scaled(this, glsl_type::vec4_type);
535 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
536
537 dst_reg rounded(this, glsl_type::vec4_type);
538 emit(RNDE(rounded, src_reg(scaled)));
539
540 dst_reg u(this, glsl_type::uvec4_type);
541 emit(MOV(u, src_reg(rounded)));
542
543 src_reg bytes(u);
544 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
545 }
546
547 void
548 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
549 {
550 dst_reg max(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
552
553 dst_reg min(this, glsl_type::vec4_type);
554 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
555
556 dst_reg scaled(this, glsl_type::vec4_type);
557 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
558
559 dst_reg rounded(this, glsl_type::vec4_type);
560 emit(RNDE(rounded, src_reg(scaled)));
561
562 dst_reg i(this, glsl_type::ivec4_type);
563 emit(MOV(i, src_reg(rounded)));
564
565 src_reg bytes(i);
566 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
567 }
568
569 /**
570 * Returns the minimum number of vec4 elements needed to pack a type.
571 *
572 * For simple types, it will return 1 (a single vec4); for matrices, the
573 * number of columns; for arrays and structs, the sum of the vec4_size of
574 * each of their elements; and for samplers and atomics, zero.
575 *
576 * This method is useful to calculate how much register space is needed to
577 * store a particular type.
578 */
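/* For example: a mat3 needs 3 vec4 slots (one per column), float[10] needs
 * 10 (each element is padded out to a full vec4), and
 * struct { vec3 a; float b; } needs 2 (one vec4 per member).
 */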
579 extern "C" int
580 type_size_vec4(const struct glsl_type *type)
581 {
582 unsigned int i;
583 int size;
584
585 switch (type->base_type) {
586 case GLSL_TYPE_UINT:
587 case GLSL_TYPE_INT:
588 case GLSL_TYPE_FLOAT:
589 case GLSL_TYPE_BOOL:
590 if (type->is_matrix()) {
591 return type->matrix_columns;
592 } else {
593 /* Regardless of size of vector, it gets a vec4. This is bad
594 * packing for things like floats, but otherwise arrays become a
595 * mess. Hopefully a later pass over the code can pack scalars
596 * down if appropriate.
597 */
598 return 1;
599 }
600 case GLSL_TYPE_ARRAY:
601 assert(type->length > 0);
602 return type_size_vec4(type->fields.array) * type->length;
603 case GLSL_TYPE_STRUCT:
604 size = 0;
605 for (i = 0; i < type->length; i++) {
606 size += type_size_vec4(type->fields.structure[i].type);
607 }
608 return size;
609 case GLSL_TYPE_SUBROUTINE:
610 return 1;
611
612 case GLSL_TYPE_SAMPLER:
613 /* Samplers take up no register space, since they're baked in at
614 * link time.
615 */
616 return 0;
617 case GLSL_TYPE_ATOMIC_UINT:
618 return 0;
619 case GLSL_TYPE_IMAGE:
620 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
621 case GLSL_TYPE_VOID:
622 case GLSL_TYPE_DOUBLE:
623 case GLSL_TYPE_ERROR:
624 case GLSL_TYPE_INTERFACE:
625 unreachable("not reached");
626 }
627
628 return 0;
629 }
630
631 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
632 {
633 init();
634
635 this->file = VGRF;
636 this->nr = v->alloc.allocate(type_size_vec4(type));
637
638 if (type->is_array() || type->is_record()) {
639 this->swizzle = BRW_SWIZZLE_NOOP;
640 } else {
641 this->swizzle = brw_swizzle_for_size(type->vector_elements);
642 }
643
644 this->type = brw_type_for_base_type(type);
645 }
646
647 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
648 {
649 assert(size > 0);
650
651 init();
652
653 this->file = VGRF;
654 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
655
656 this->swizzle = BRW_SWIZZLE_NOOP;
657
658 this->type = brw_type_for_base_type(type);
659 }
660
661 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
662 {
663 init();
664
665 this->file = VGRF;
666 this->nr = v->alloc.allocate(type_size_vec4(type));
667
668 if (type->is_array() || type->is_record()) {
669 this->writemask = WRITEMASK_XYZW;
670 } else {
671 this->writemask = (1 << type->vector_elements) - 1;
672 }
673
674 this->type = brw_type_for_base_type(type);
675 }
676
677 vec4_instruction *
678 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
679 src_reg src0, src_reg src1)
680 {
681 vec4_instruction *inst;
682
683 if (devinfo->gen >= 6) {
684 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
685 inst->conditional_mod = conditionalmod;
686 } else {
687 emit(CMP(dst, src0, src1, conditionalmod));
688
689 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
690 inst->predicate = BRW_PREDICATE_NORMAL;
691 }
692
693 return inst;
694 }
695
696 vec4_instruction *
697 vec4_visitor::emit_lrp(const dst_reg &dst,
698 const src_reg &x, const src_reg &y, const src_reg &a)
699 {
700 if (devinfo->gen >= 6) {
701 /* Note that the instruction's argument order is reversed from GLSL
702 * and the IR.
703 */
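/* Concretely, the LRP emitted below evaluates a * y + (1 - a) * x, i.e.
 * GLSL's mix(x, y, a), matching the pre-gen6 expansion in the else branch.
 */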
704 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
705 fix_3src_operand(x)));
706 } else {
707 /* Earlier generations don't support three source operations, so we
708 * need to emit x*(1-a) + y*a.
709 */
710 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
711 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
712 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
713 y_times_a.writemask = dst.writemask;
714 one_minus_a.writemask = dst.writemask;
715 x_times_one_minus_a.writemask = dst.writemask;
716
717 emit(MUL(y_times_a, y, a));
718 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
719 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
720 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
721 }
722 }
723
724 /**
725 * Emits the instructions needed to perform a pull constant load. before_block
726 * and before_inst can be NULL, in which case the instructions will be appended
727 * to the end of the instruction list.
728 */
729 void
730 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
731 src_reg surf_index,
732 src_reg offset_reg,
733 bblock_t *before_block,
734 vec4_instruction *before_inst)
735 {
736 assert((before_inst == NULL && before_block == NULL) ||
737 (before_inst && before_block));
738
739 vec4_instruction *pull;
740
741 if (devinfo->gen >= 9) {
742 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
743 src_reg header(this, glsl_type::uvec4_type, 2);
744
745 pull = new(mem_ctx)
746 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
747 dst_reg(header));
748
749 if (before_inst)
750 emit_before(before_block, before_inst, pull);
751 else
752 emit(pull);
753
754 dst_reg index_reg = retype(offset(dst_reg(header), 1),
755 offset_reg.type);
756 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
757
758 if (before_inst)
759 emit_before(before_block, before_inst, pull);
760 else
761 emit(pull);
762
763 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
764 dst,
765 surf_index,
766 header);
767 pull->mlen = 2;
768 pull->header_size = 1;
769 } else if (devinfo->gen >= 7) {
770 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
771
772 grf_offset.type = offset_reg.type;
773
774 pull = MOV(grf_offset, offset_reg);
775
776 if (before_inst)
777 emit_before(before_block, before_inst, pull);
778 else
779 emit(pull);
780
781 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
782 dst,
783 surf_index,
784 src_reg(grf_offset));
785 pull->mlen = 1;
786 } else {
787 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
788 dst,
789 surf_index,
790 offset_reg);
791 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
792 pull->mlen = 1;
793 }
794
795 if (before_inst)
796 emit_before(before_block, before_inst, pull);
797 else
798 emit(pull);
799 }
800
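/**
 * Emit FIND_LIVE_CHANNEL to locate the first enabled channel and BROADCAST
 * to replicate that channel's value of \p src across the result, yielding
 * a dynamically uniform value.
 */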
801 src_reg
802 vec4_visitor::emit_uniformize(const src_reg &src)
803 {
804 const src_reg chan_index(this, glsl_type::uint_type);
805 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
806 src.type);
807
808 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
809 ->force_writemask_all = true;
810 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
811 ->force_writemask_all = true;
812
813 return src_reg(dst);
814 }
815
816 src_reg
817 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
818 src_reg coordinate, src_reg sampler)
819 {
820 vec4_instruction *inst =
821 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
822 dst_reg(this, glsl_type::uvec4_type));
823 inst->base_mrf = 2;
824 inst->src[1] = sampler;
825
826 int param_base;
827
828 if (devinfo->gen >= 9) {
829 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
830 vec4_instruction *header_inst = new(mem_ctx)
831 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
832 dst_reg(MRF, inst->base_mrf));
833
834 emit(header_inst);
835
836 inst->mlen = 2;
837 inst->header_size = 1;
838 param_base = inst->base_mrf + 1;
839 } else {
840 inst->mlen = 1;
841 param_base = inst->base_mrf;
842 }
843
844 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
845 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
846 int zero_mask = 0xf & ~coord_mask;
847
848 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
849 coordinate));
850
851 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
852 brw_imm_d(0)));
853
854 emit(inst);
855 return src_reg(inst->dst);
856 }
857
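/**
 * True when the sampler index can't be encoded in the 4-bit sampler field
 * of the message descriptor (indices 16 and up, or any dynamically indexed
 * sampler) and so has to be supplied through the message header instead.
 * Only Haswell and later can do this, so earlier parts always get false.
 */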
858 bool
859 vec4_visitor::is_high_sampler(src_reg sampler)
860 {
861 if (devinfo->gen < 8 && !devinfo->is_haswell)
862 return false;
863
864 return sampler.file != IMM || sampler.ud >= 16;
865 }
866
867 void
868 vec4_visitor::emit_texture(ir_texture_opcode op,
869 dst_reg dest,
870 const glsl_type *dest_type,
871 src_reg coordinate,
872 int coord_components,
873 src_reg shadow_comparitor,
874 src_reg lod, src_reg lod2,
875 src_reg sample_index,
876 uint32_t constant_offset,
877 src_reg offset_value,
878 src_reg mcs,
879 bool is_cube_array,
880 uint32_t sampler,
881 src_reg sampler_reg)
882 {
883 /* The sampler can only meaningfully compute LOD for fragment shader
884 * messages. For all other stages, we change the opcode to TXL and hardcode
885 * the LOD to 0.
886 *
887 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
888 * valid LOD argument.
889 */
890 if (op == ir_tex || op == ir_query_levels) {
891 assert(lod.file == BAD_FILE);
892 lod = brw_imm_f(0.0f);
893 }
894
895 enum opcode opcode;
896 switch (op) {
897 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
898 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
899 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
900 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
901 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
902 SHADER_OPCODE_TXF_CMS); break;
903 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
904 case ir_tg4: opcode = offset_value.file != BAD_FILE
905 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
906 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
907 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
908 case ir_txb:
909 unreachable("TXB is not valid for vertex shaders.");
910 case ir_lod:
911 unreachable("LOD is not valid for vertex shaders.");
912 case ir_samples_identical: {
913 /* There are some challenges implementing this for vec4, and it seems
914 * unlikely to be used anyway. For now, just always return false.
915 */
916 emit(MOV(dest, brw_imm_ud(0u)));
917 return;
918 }
919 default:
920 unreachable("Unrecognized tex op");
921 }
922
923 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
924 opcode, dst_reg(this, dest_type));
925
926 inst->offset = constant_offset;
927
928 /* The message header is necessary for:
929 * - Gen4 (always)
930 * - Gen9+ for selecting SIMD4x2
931 * - Texel offsets
932 * - Gather channel selection
933 * - Sampler indices too large to fit in a 4-bit value.
934 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
935 */
936 inst->header_size =
937 (devinfo->gen < 5 || devinfo->gen >= 9 ||
938 inst->offset != 0 || op == ir_tg4 ||
939 op == ir_texture_samples ||
940 is_high_sampler(sampler_reg)) ? 1 : 0;
941 inst->base_mrf = 2;
942 inst->mlen = inst->header_size;
943 inst->dst.writemask = WRITEMASK_XYZW;
944 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
945
946 inst->src[1] = sampler_reg;
947
948 /* MRF for the first parameter */
949 int param_base = inst->base_mrf + inst->header_size;
950
951 if (op == ir_txs || op == ir_query_levels) {
952 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
953 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
954 inst->mlen++;
955 } else if (op == ir_texture_samples) {
956 inst->dst.writemask = WRITEMASK_X;
957 } else {
958 /* Load the coordinate */
959 /* FINISHME: gl_clamp_mask and saturate */
960 int coord_mask = (1 << coord_components) - 1;
961 int zero_mask = 0xf & ~coord_mask;
962
963 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
964 coordinate));
965 inst->mlen++;
966
967 if (zero_mask != 0) {
968 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
969 brw_imm_d(0)));
970 }
971 /* Load the shadow comparitor */
972 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
973 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
974 WRITEMASK_X),
975 shadow_comparitor));
976 inst->mlen++;
977 }
978
979 /* Load the LOD info */
980 if (op == ir_tex || op == ir_txl) {
981 int mrf, writemask;
982 if (devinfo->gen >= 5) {
983 mrf = param_base + 1;
984 if (shadow_comparitor.file != BAD_FILE) {
985 writemask = WRITEMASK_Y;
986 /* mlen already incremented */
987 } else {
988 writemask = WRITEMASK_X;
989 inst->mlen++;
990 }
991 } else /* devinfo->gen == 4 */ {
992 mrf = param_base;
993 writemask = WRITEMASK_W;
994 }
995 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
996 } else if (op == ir_txf) {
997 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
998 } else if (op == ir_txf_ms) {
999 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1000 sample_index));
1001 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1002 /* MCS data is stored in the first two channels of 'mcs', but we
1003 * need to get it into the .y and .z channels of the second vec4
1004 * of params.
1005 */
1006 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1007 emit(MOV(dst_reg(MRF, param_base + 1,
1008 glsl_type::uint_type, WRITEMASK_YZ),
1009 mcs));
1010 } else if (devinfo->gen >= 7) {
1011 /* MCS data is in the first channel of `mcs`, but we need to get it into
1012 * the .y channel of the second vec4 of params, so replicate .x across
1013 * the whole vec4 and then mask off everything except .y
1014 */
1015 mcs.swizzle = BRW_SWIZZLE_XXXX;
1016 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1017 mcs));
1018 }
1019 inst->mlen++;
1020 } else if (op == ir_txd) {
1021 const brw_reg_type type = lod.type;
1022
1023 if (devinfo->gen >= 5) {
1024 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1025 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1026 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1027 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1028 inst->mlen++;
1029
1030 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1031 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1032 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1033 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1034 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1035 inst->mlen++;
1036
1037 if (shadow_comparitor.file != BAD_FILE) {
1038 emit(MOV(dst_reg(MRF, param_base + 2,
1039 shadow_comparitor.type, WRITEMASK_Z),
1040 shadow_comparitor));
1041 }
1042 }
1043 } else /* devinfo->gen == 4 */ {
1044 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1045 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1046 inst->mlen += 2;
1047 }
1048 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1049 if (shadow_comparitor.file != BAD_FILE) {
1050 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1051 shadow_comparitor));
1052 }
1053
1054 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1055 offset_value));
1056 inst->mlen++;
1057 }
1058 }
1059
1060 emit(inst);
1061
1062 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1063 * spec requires layers.
1064 */
1065 if (op == ir_txs && is_cube_array) {
1066 emit_math(SHADER_OPCODE_INT_QUOTIENT,
1067 writemask(inst->dst, WRITEMASK_Z),
1068 src_reg(inst->dst), brw_imm_d(6));
1069 }
1070
1071 if (devinfo->gen == 6 && op == ir_tg4) {
1072 emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
1073 }
1074
1075 swizzle_result(op, dest,
1076 src_reg(inst->dst), sampler, dest_type);
1077 }
1078
1079 /**
1080 * Apply workarounds for Gen6 gather with UINT/SINT
1081 */
1082 void
1083 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1084 {
1085 if (!wa)
1086 return;
1087
1088 int width = (wa & WA_8BIT) ? 8 : 16;
1089 dst_reg dst_f = dst;
1090 dst_f.type = BRW_REGISTER_TYPE_F;
1091
1092 /* Convert from UNORM to UINT */
1093 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1094 emit(MOV(dst, src_reg(dst_f)));
1095
1096 if (wa & WA_SIGN) {
1097 /* Reinterpret the UINT value as a signed INT value by
1098 * shifting the sign bit into place, then shifting back
1099 * preserving sign.
1100 */
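/* E.g. for an 8-bit SINT format, a stored -1 comes back as UNORM 1.0; the
 * MUL above turned that into 255 (0x000000ff), and the SHL/ASR below by 24
 * reinterpret it as the signed value -1.
 */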
1101 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1102 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1103 }
1104 }
1105
1106 /**
1107 * Set up the gather channel based on the swizzle, for gather4.
1108 */
1109 uint32_t
1110 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
1111 {
1112 int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
1113 switch (swiz) {
1114 case SWIZZLE_X: return 0;
1115 case SWIZZLE_Y:
1116 /* gather4 sampler is broken for green channel on RG32F --
1117 * we must ask for blue instead.
1118 */
1119 if (key_tex->gather_channel_quirk_mask & (1 << sampler))
1120 return 2;
1121 return 1;
1122 case SWIZZLE_Z: return 2;
1123 case SWIZZLE_W: return 3;
1124 default:
1125 unreachable("Not reached"); /* zero, one swizzles handled already */
1126 }
1127 }
1128
1129 void
1130 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
1131 src_reg orig_val, uint32_t sampler,
1132 const glsl_type *dest_type)
1133 {
1134 int s = key_tex->swizzles[sampler];
1135
1136 dst_reg swizzled_result = dest;
1137
1138 if (op == ir_query_levels) {
1139 /* # levels is in .w */
1140 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1141 emit(MOV(swizzled_result, orig_val));
1142 return;
1143 }
1144
1145 if (op == ir_txs || dest_type == glsl_type::float_type
1146 || s == SWIZZLE_NOOP || op == ir_tg4) {
1147 emit(MOV(swizzled_result, orig_val));
1148 return;
1149 }
1150
1151
1152 int zero_mask = 0, one_mask = 0, copy_mask = 0;
1153 int swizzle[4] = {0};
1154
1155 for (int i = 0; i < 4; i++) {
1156 switch (GET_SWZ(s, i)) {
1157 case SWIZZLE_ZERO:
1158 zero_mask |= (1 << i);
1159 break;
1160 case SWIZZLE_ONE:
1161 one_mask |= (1 << i);
1162 break;
1163 default:
1164 copy_mask |= (1 << i);
1165 swizzle[i] = GET_SWZ(s, i);
1166 break;
1167 }
1168 }
1169
1170 if (copy_mask) {
1171 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1172 swizzled_result.writemask = copy_mask;
1173 emit(MOV(swizzled_result, orig_val));
1174 }
1175
1176 if (zero_mask) {
1177 swizzled_result.writemask = zero_mask;
1178 emit(MOV(swizzled_result, brw_imm_f(0.0f)));
1179 }
1180
1181 if (one_mask) {
1182 swizzled_result.writemask = one_mask;
1183 emit(MOV(swizzled_result, brw_imm_f(1.0f)));
1184 }
1185 }
1186
1187 void
1188 vec4_visitor::gs_emit_vertex(int stream_id)
1189 {
1190 unreachable("not reached");
1191 }
1192
1193 void
1194 vec4_visitor::gs_end_primitive()
1195 {
1196 unreachable("not reached");
1197 }
1198
1199 void
1200 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1201 dst_reg dst, src_reg surf_offset,
1202 src_reg src0, src_reg src1)
1203 {
1204 unsigned mlen = 1 + (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
1205 src_reg src_payload(this, glsl_type::uint_type, mlen);
1206 dst_reg payload(src_payload);
1207 payload.writemask = WRITEMASK_X;
1208
1209 /* Set the atomic operation offset. */
1210 emit(MOV(offset(payload, 0), surf_offset));
1211 unsigned i = 1;
1212
1213 /* Set the atomic operation arguments. */
1214 if (src0.file != BAD_FILE) {
1215 emit(MOV(offset(payload, i), src0));
1216 i++;
1217 }
1218
1219 if (src1.file != BAD_FILE) {
1220 emit(MOV(offset(payload, i), src1));
1221 i++;
1222 }
1223
1224 /* Emit the instruction. Note that this maps to the normal SIMD8
1225 * untyped atomic message on Ivy Bridge, but that's OK because
1226 * unused channels will be masked out.
1227 */
1228 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1229 src_payload,
1230 brw_imm_ud(surf_index), brw_imm_ud(atomic_op));
1231 inst->mlen = mlen;
1232 }
1233
1234 void
1235 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1236 src_reg surf_offset)
1237 {
1238 dst_reg offset(this, glsl_type::uint_type);
1239 offset.writemask = WRITEMASK_X;
1240
1241 /* Set the surface read offset. */
1242 emit(MOV(offset, surf_offset));
1243
1244 /* Emit the instruction. Note that this maps to the normal SIMD8
1245 * untyped surface read message, but that's OK because unused
1246 * channels will be masked out.
1247 */
1248 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1249 src_reg(offset),
1250 brw_imm_ud(surf_index), brw_imm_d(1));
1251 inst->mlen = 1;
1252 }
1253
1254 void
1255 vec4_visitor::emit_ndc_computation()
1256 {
1257 if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1258 return;
1259
1260 /* Get the position */
1261 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1262
1263 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1264 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1265 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1266
1267 current_annotation = "NDC";
1268 dst_reg ndc_w = ndc;
1269 ndc_w.writemask = WRITEMASK_W;
1270 src_reg pos_w = pos;
1271 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1272 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1273
1274 dst_reg ndc_xyz = ndc;
1275 ndc_xyz.writemask = WRITEMASK_XYZ;
1276
1277 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1278 }
1279
1280 void
1281 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1282 {
1283 if (devinfo->gen < 6 &&
1284 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1285 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1286 devinfo->has_negative_rhw_bug)) {
1287 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1288 dst_reg header1_w = header1;
1289 header1_w.writemask = WRITEMASK_W;
1290
1291 emit(MOV(header1, brw_imm_ud(0u)));
1292
1293 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1294 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1295
1296 current_annotation = "Point size";
1297 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1298 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1299 }
1300
1301 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1302 current_annotation = "Clipping flags";
1303 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1304 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1305
1306 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1307 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1308 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1309
1310 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1311 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1312 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1313 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1314 }
1315
1316 /* i965 clipping workaround:
1317 * 1) Test for -ve rhw
1318 * 2) If set,
1319 * set ndc = (0,0,0,0)
1320 * set ucp[6] = 1
1321 *
1322 * Later, clipping will detect ucp[6] and ensure the primitive is
1323 * clipped against all fixed planes.
1324 */
1325 if (devinfo->has_negative_rhw_bug &&
1326 output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1327 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1328 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1329 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1330 vec4_instruction *inst;
1331 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1332 inst->predicate = BRW_PREDICATE_NORMAL;
1333 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1334 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], brw_imm_f(0.0f)));
1335 inst->predicate = BRW_PREDICATE_NORMAL;
1336 }
1337
1338 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1339 } else if (devinfo->gen < 6) {
1340 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1341 } else {
1342 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1343 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1344 dst_reg reg_w = reg;
1345 reg_w.writemask = WRITEMASK_W;
1346 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1347 reg_as_src.type = reg_w.type;
1348 reg_as_src.swizzle = brw_swizzle_for_size(1);
1349 emit(MOV(reg_w, reg_as_src));
1350 }
1351 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1352 dst_reg reg_y = reg;
1353 reg_y.writemask = WRITEMASK_Y;
1354 reg_y.type = BRW_REGISTER_TYPE_D;
1355 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1356 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1357 }
1358 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1359 dst_reg reg_z = reg;
1360 reg_z.writemask = WRITEMASK_Z;
1361 reg_z.type = BRW_REGISTER_TYPE_D;
1362 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1363 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1364 }
1365 }
1366 }
1367
1368 vec4_instruction *
1369 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1370 {
1371 assert(varying < VARYING_SLOT_MAX);
1372 assert(output_reg[varying].type == reg.type);
1373 current_annotation = output_reg_annotation[varying];
1374 if (output_reg[varying].file != BAD_FILE)
1375 return emit(MOV(reg, src_reg(output_reg[varying])));
1376 else
1377 return NULL;
1378 }
1379
1380 void
1381 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1382 {
1383 reg.type = BRW_REGISTER_TYPE_F;
1384 output_reg[varying].type = reg.type;
1385
1386 switch (varying) {
1387 case VARYING_SLOT_PSIZ:
1388 {
1389 /* PSIZ is always in slot 0, and is coupled with other flags. */
1390 current_annotation = "indices, point width, clip flags";
1391 emit_psiz_and_flags(reg);
1392 break;
1393 }
1394 case BRW_VARYING_SLOT_NDC:
1395 current_annotation = "NDC";
1396 if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1397 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1398 break;
1399 case VARYING_SLOT_POS:
1400 current_annotation = "gl_Position";
1401 if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1402 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1403 break;
1404 case VARYING_SLOT_EDGE:
1405 /* This is present when doing unfilled polygons. We're supposed to copy
1406 * the edge flag from the user-provided vertex array
1407 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1408 * of that attribute (starts as 1.0f). This is then used in clipping to
1409 * determine which edges should be drawn as wireframe.
1410 */
1411 current_annotation = "edge flag";
1412 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1413 glsl_type::float_type, WRITEMASK_XYZW))));
1414 break;
1415 case BRW_VARYING_SLOT_PAD:
1416 /* No need to write to this slot */
1417 break;
1418 default:
1419 emit_generic_urb_slot(reg, varying);
1420 break;
1421 }
1422 }
1423
1424 static int
1425 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1426 {
1427 if (devinfo->gen >= 6) {
1428 /* URB data written (does not include the message header reg) must
1429 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1430 * section 5.4.3.2.2: URB_INTERLEAVED.
1431 *
1432 * URB entries are allocated on a multiple of 1024 bits, so an
1433 * extra 128 bits written here to make the end align to 256 is
1434 * no problem.
1435 */
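/* mlen counts the message header register too, so the data length is
 * mlen - 1 and mlen itself must end up odd. E.g. a header plus 5 slot
 * registers gives mlen = 6, which is bumped to 7 (6 data registers).
 */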
1436 if ((mlen % 2) != 1)
1437 mlen++;
1438 }
1439
1440 return mlen;
1441 }
1442
1443
1444 /**
1445 * Generates the VUE payload plus the necessary URB write instructions to
1446 * output it.
1447 *
1448 * The VUE layout is documented in Volume 2a.
1449 */
1450 void
1451 vec4_visitor::emit_vertex()
1452 {
1453 /* MRF 0 is reserved for the debugger, so start with message header
1454 * in MRF 1.
1455 */
1456 int base_mrf = 1;
1457 int mrf = base_mrf;
1458 /* In the process of generating our URB write message contents, we
1459 * may need to unspill a register or load from an array. Those
1460 * reads would use MRFs 14-15.
1461 */
1462 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1463
1464 /* The following assertion verifies that max_usable_mrf causes an
1465 * even number of URB write data registers, which meets gen6's
1466 * requirements for length alignment.
1467 */
1468 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1469
1470 /* First mrf is the g0-based message header containing URB handles and
1471 * such.
1472 */
1473 emit_urb_write_header(mrf++);
1474
1475 if (devinfo->gen < 6) {
1476 emit_ndc_computation();
1477 }
1478
1479 /* We may need to split this up into several URB writes, so do them in a
1480 * loop.
1481 */
1482 int slot = 0;
1483 bool complete = false;
1484 do {
1485 /* URB offset is in URB row increments, and each of our MRFs is half of
1486 * one of those, since we're doing interleaved writes.
1487 */
1488 int offset = slot / 2;
1489
1490 mrf = base_mrf + 1;
1491 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1492 emit_urb_slot(dst_reg(MRF, mrf++),
1493 prog_data->vue_map.slot_to_varying[slot]);
1494
1495 /* If this was max_usable_mrf, we can't fit anything more into this
1496 * URB WRITE. Same thing if we reached the maximum length available.
1497 */
1498 if (mrf > max_usable_mrf ||
1499 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1500 slot++;
1501 break;
1502 }
1503 }
1504
1505 complete = slot >= prog_data->vue_map.num_slots;
1506 current_annotation = "URB write";
1507 vec4_instruction *inst = emit_urb_write_opcode(complete);
1508 inst->base_mrf = base_mrf;
1509 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1510 inst->offset += offset;
1511 } while(!complete);
1512 }
1513
1514
1515 src_reg
1516 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1517 src_reg *reladdr, int reg_offset)
1518 {
1519 /* Because we store the values to scratch interleaved like our
1520 * vertex data, we need to scale the vec4 index by 2.
1521 */
1522 int message_header_scale = 2;
1523
1524 /* Pre-gen6, the message header uses byte offsets instead of vec4
1525 * (16-byte) offset units.
1526 */
1527 if (devinfo->gen < 6)
1528 message_header_scale *= 16;
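/* E.g. a constant reg_offset of 5 becomes a message offset of 10 on gen6+
 * and 160 (a byte offset) on gen4/5.
 */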
1529
1530 if (reladdr) {
1531 src_reg index = src_reg(this, glsl_type::int_type);
1532
1533 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1534 brw_imm_d(reg_offset)));
1535 emit_before(block, inst, MUL(dst_reg(index), index,
1536 brw_imm_d(message_header_scale)));
1537
1538 return index;
1539 } else {
1540 return brw_imm_d(reg_offset * message_header_scale);
1541 }
1542 }
1543
1544 src_reg
1545 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
1546 src_reg *reladdr, int reg_offset)
1547 {
1548 if (reladdr) {
1549 src_reg index = src_reg(this, glsl_type::int_type);
1550
1551 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1552 brw_imm_d(reg_offset)));
1553
1554 /* Pre-gen6, the message header uses byte offsets instead of vec4
1555 * (16-byte) offset units.
1556 */
1557 if (devinfo->gen < 6) {
1558 emit_before(block, inst, MUL(dst_reg(index), index, brw_imm_d(16)));
1559 }
1560
1561 return index;
1562 } else if (devinfo->gen >= 8) {
1563 /* Store the offset in a GRF so we can send-from-GRF. */
1564 src_reg offset = src_reg(this, glsl_type::int_type);
1565 emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset)));
1566 return offset;
1567 } else {
1568 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
1569 return brw_imm_d(reg_offset * message_header_scale);
1570 }
1571 }
1572
1573 /**
1574 * Emits an instruction before @inst to load the value named by @orig_src
1575 * from scratch space at @base_offset to @temp.
1576 *
1577 * @base_offset is measured in 32-byte units (the size of a register).
1578 */
1579 void
1580 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1581 dst_reg temp, src_reg orig_src,
1582 int base_offset)
1583 {
1584 int reg_offset = base_offset + orig_src.reg_offset;
1585 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1586 reg_offset);
1587
1588 emit_before(block, inst, SCRATCH_READ(temp, index));
1589 }
1590
1591 /**
1592 * Emits an instruction after @inst to store the value to be written
1593 * to @orig_dst to scratch space at @base_offset, from @temp.
1594 *
1595 * @base_offset is measured in 32-byte units (the size of a register).
1596 */
1597 void
1598 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1599 int base_offset)
1600 {
1601 int reg_offset = base_offset + inst->dst.reg_offset;
1602 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1603 reg_offset);
1604
1605 /* Create a temporary register to store *inst's result in.
1606 *
1607 * We have to be careful in MOVing from our temporary result register in
1608 * the scratch write. If we swizzle from channels of the temporary that
1609 * weren't initialized, it will confuse live interval analysis, which will
1610 * make spilling fail to make progress.
1611 */
1612 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1613 inst->dst.type),
1614 brw_swizzle_for_mask(inst->dst.writemask));
1615 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1616 inst->dst.writemask));
1617 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1618 if (inst->opcode != BRW_OPCODE_SEL)
1619 write->predicate = inst->predicate;
1620 write->ir = inst->ir;
1621 write->annotation = inst->annotation;
1622 inst->insert_after(block, write);
1623
1624 inst->dst.file = temp.file;
1625 inst->dst.nr = temp.nr;
1626 inst->dst.reg_offset = temp.reg_offset;
1627 inst->dst.reladdr = NULL;
1628 }
1629
1630 /**
1631 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1632 * adds the scratch read(s) before \p inst. The function also checks for
1633 * recursive reladdr scratch accesses, issuing the corresponding scratch
1634 * loads and rewriting reladdr references accordingly.
1635 *
1636 * \return \p src if it did not require a scratch load, otherwise, the
1637 * register holding the result of the scratch load that the caller should
1638 * use to rewrite src.
1639 */
1640 src_reg
1641 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1642 vec4_instruction *inst, src_reg src)
1643 {
1644 /* Resolve recursive reladdr scratch access by calling ourselves
1645 * with src.reladdr
1646 */
1647 if (src.reladdr)
1648 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1649 *src.reladdr);
1650
1651 /* Now handle scratch access on src */
1652 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1653 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1654 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1655 src.nr = temp.nr;
1656 src.reg_offset = temp.reg_offset;
1657 src.reladdr = NULL;
1658 }
1659
1660 return src;
1661 }
1662
1663 /**
1664 * We can't generally support array access in GRF space, because a
1665 * single instruction's destination can only span 2 contiguous
1666 * registers. So, we send all GRF arrays that get variable index
1667 * access to scratch space.
1668 */
1669 void
1670 vec4_visitor::move_grf_array_access_to_scratch()
1671 {
1672 int scratch_loc[this->alloc.count];
1673 memset(scratch_loc, -1, sizeof(scratch_loc));
1674
1675 /* First, calculate the set of virtual GRFs that need to be punted
1676 * to scratch due to having any array access on them, and where in
1677 * scratch.
1678 */
1679 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1680 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1681 if (scratch_loc[inst->dst.nr] == -1) {
1682 scratch_loc[inst->dst.nr] = last_scratch;
1683 last_scratch += this->alloc.sizes[inst->dst.nr];
1684 }
1685
1686 for (src_reg *iter = inst->dst.reladdr;
1687 iter->reladdr;
1688 iter = iter->reladdr) {
1689 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1690 scratch_loc[iter->nr] = last_scratch;
1691 last_scratch += this->alloc.sizes[iter->nr];
1692 }
1693 }
1694 }
1695
1696 for (int i = 0 ; i < 3; i++) {
1697 for (src_reg *iter = &inst->src[i];
1698 iter->reladdr;
1699 iter = iter->reladdr) {
1700 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1701 scratch_loc[iter->nr] = last_scratch;
1702 last_scratch += this->alloc.sizes[iter->nr];
1703 }
1704 }
1705 }
1706 }
1707
1708 /* Now, for anything that will be accessed through scratch, rewrite
1709 * it to load/store. Note that this is a _safe list walk, because
1710 * we may generate a new scratch_write instruction after the one
1711 * we're processing.
1712 */
1713 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1714 /* Set up the annotation tracking for new generated instructions. */
1715 base_ir = inst->ir;
1716 current_annotation = inst->annotation;
1717
1718 /* First handle scratch access on the dst. Notice we have to handle
1719 * the case where the dst's reladdr also points to scratch space.
1720 */
1721 if (inst->dst.reladdr)
1722 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1723 *inst->dst.reladdr);
1724
1725 /* Now that we have handled any (possibly recursive) reladdr scratch
1726 * accesses for dst we can safely do the scratch write for dst itself
1727 */
1728 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1729 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1730
1731 /* Now handle scratch access on any src. In this case, since inst->src[i]
1732 * already is a src_reg, we can just call emit_resolve_reladdr with
1733 * inst->src[i] and it will take care of handling scratch loads for
1734 * both src and src.reladdr (recursively).
1735 */
1736 for (int i = 0 ; i < 3; i++) {
1737 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1738 inst->src[i]);
1739 }
1740 }
1741 }
1742
1743 /**
1744 * Emits an instruction before @inst to load the value named by @orig_src
1745 * from the pull constant buffer (surface) at @base_offset to @temp.
1746 */
1747 void
1748 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1749 dst_reg temp, src_reg orig_src,
1750 int base_offset)
1751 {
1752 int reg_offset = base_offset + orig_src.reg_offset;
1753 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1754 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
1755 reg_offset);
1756
1757 emit_pull_constant_load_reg(temp,
1758 brw_imm_ud(index),
1759 offset,
1760 block, inst);
1761
1762 brw_mark_surface_used(&prog_data->base, index);
1763 }
1764
1765 /**
1766 * Implements array access of uniforms by inserting a
1767 * PULL_CONSTANT_LOAD instruction.
1768 *
1769 * Unlike temporary GRF array access (which we don't support due to
1770 * the difficulty of doing relative addressing on instruction
1771 * destinations), we could potentially do array access of uniforms
1772 * that were loaded in GRF space as push constants. In real-world
1773 * usage we've seen, though, the arrays being used are always larger
1774 * than we could load as push constants, so just always move all
1775 * uniform array access out to a pull constant buffer.
1776 */
1777 void
1778 vec4_visitor::move_uniform_array_access_to_pull_constants()
1779 {
1780 int pull_constant_loc[this->uniforms];
1781 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1782 bool nested_reladdr;
1783
1784 /* Walk through and find array access of uniforms. Put a copy of that
1785 * uniform in the pull constant buffer.
1786 *
1787 * Note that we don't move constant-indexed accesses to arrays. No
1788 * testing has been done of the performance impact of this choice.
1789 */
1790 do {
1791 nested_reladdr = false;
1792
1793 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1794 for (int i = 0 ; i < 3; i++) {
1795 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1796 continue;
1797
1798 int uniform = inst->src[i].nr;
1799
1800 if (inst->src[i].reladdr->reladdr)
1801 nested_reladdr = true; /* will need another pass */
1802
1803 /* If this array isn't already present in the pull constant buffer,
1804 * add it.
1805 */
1806 if (pull_constant_loc[uniform] == -1) {
1807 const gl_constant_value **values =
1808 &stage_prog_data->param[uniform * 4];
1809
1810 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
1811
1812 assert(uniform < uniform_array_size);
1813 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
1814 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1815 = values[j];
1816 }
1817 }
1818
1819 /* Set up the annotation tracking for new generated instructions. */
1820 base_ir = inst->ir;
1821 current_annotation = inst->annotation;
1822
1823 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1824
1825 emit_pull_constant_load(block, inst, temp, inst->src[i],
1826 pull_constant_loc[uniform]);
1827
1828 inst->src[i].file = temp.file;
1829 inst->src[i].nr = temp.nr;
1830 inst->src[i].reg_offset = temp.reg_offset;
1831 inst->src[i].reladdr = NULL;
1832 }
1833 }
1834 } while (nested_reladdr);
1835
1836 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1837 * no need to track them as larger-than-vec4 objects. This will be
1838 * relied on in cutting out unused uniform vectors from push
1839 * constants.
1840 */
1841 split_uniform_registers();
1842 }
1843
1844 void
1845 vec4_visitor::resolve_ud_negate(src_reg *reg)
1846 {
1847 if (reg->type != BRW_REGISTER_TYPE_UD ||
1848 !reg->negate)
1849 return;
1850
1851 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1852 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1853 *reg = temp;
1854 }
1855
1856 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1857 void *log_data,
1858 const struct brw_sampler_prog_key_data *key_tex,
1859 struct brw_vue_prog_data *prog_data,
1860 const nir_shader *shader,
1861 void *mem_ctx,
1862 bool no_spills,
1863 int shader_time_index)
1864 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1865 key_tex(key_tex),
1866 prog_data(prog_data),
1867 fail_msg(NULL),
1868 first_non_payload_grf(0),
1869 need_all_constants_in_pull_buffer(false),
1870 no_spills(no_spills),
1871 shader_time_index(shader_time_index),
1872 last_scratch(0)
1873 {
1874 this->failed = false;
1875
1876 this->base_ir = NULL;
1877 this->current_annotation = NULL;
1878 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1879
1880 this->virtual_grf_start = NULL;
1881 this->virtual_grf_end = NULL;
1882 this->live_intervals = NULL;
1883
1884 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1885
1886 this->uniforms = 0;
1887
1888 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
1889 * at least one. See setup_uniforms() in brw_vec4.cpp.
1890 */
1891 this->uniform_array_size = 1;
1892 if (prog_data) {
1893 this->uniform_array_size =
1894 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
1895 }
1896
1897 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
1898 }
1899
1900 vec4_visitor::~vec4_visitor()
1901 {
1902 }
1903
1904
1905 void
1906 vec4_visitor::fail(const char *format, ...)
1907 {
1908 va_list va;
1909 char *msg;
1910
1911 if (failed)
1912 return;
1913
1914 failed = true;
1915
1916 va_start(va, format);
1917 msg = ralloc_vasprintf(mem_ctx, format, va);
1918 va_end(va);
1919 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1920
1921 this->fail_msg = msg;
1922
1923 if (debug_enabled) {
1924 fprintf(stderr, "%s", msg);
1925 }
1926 }
1927
1928 } /* namespace brw */