i965/vec4: Don't emit MOVs for unused URB slots.
[mesa.git] / src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
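/* For reference, an instantiation such as ALU2(ADD) below expands to a small
 * factory method:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * Note that these only build the instruction; callers still pass the result
 * to emit() (e.g. emit(ADD(dst, a, b))) to append it to the stream.
 */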
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 src_reg
280 vec4_visitor::fix_3src_operand(const src_reg &src)
281 {
282 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
283 * able to use vertical stride of zero to replicate the vec4 uniform, like
284 *
285 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
286 *
287 * But you can't, since vertical stride is always four in three-source
288 * instructions. Instead, insert a MOV instruction to do the replication so
289 * that the three-source instruction can consume it.
290 */
291
292 /* The MOV is only needed if the source is a uniform or immediate. */
293 if (src.file != UNIFORM && src.file != IMM)
294 return src;
295
296 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
297 return src;
298
299 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
300 expanded.type = src.type;
301 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
302 return src_reg(expanded);
303 }
304
305 src_reg
306 vec4_visitor::resolve_source_modifiers(const src_reg &src)
307 {
308 if (!src.abs && !src.negate)
309 return src;
310
311 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
312 resolved.type = src.type;
313 emit(MOV(resolved, src));
314
315 return src_reg(resolved);
316 }
317
318 src_reg
319 vec4_visitor::fix_math_operand(const src_reg &src)
320 {
321 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
322 return src;
323
324 /* The gen6 math instruction ignores the source modifiers --
325 * swizzle, abs, negate, and at least some parts of the register
326 * region description.
327 *
328 * Rather than trying to enumerate all these cases, *always* expand the
329 * operand to a temp GRF for gen6.
330 *
331 * For gen7, keep the operand as-is, except if immediate, which gen7 still
332 * can't use.
333 */
334
335 if (devinfo->gen == 7 && src.file != IMM)
336 return src;
337
338 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
339 expanded.type = src.type;
340 emit(MOV(expanded, src));
341 return src_reg(expanded);
342 }
343
344 vec4_instruction *
345 vec4_visitor::emit_math(enum opcode opcode,
346 const dst_reg &dst,
347 const src_reg &src0, const src_reg &src1)
348 {
349 vec4_instruction *math =
350 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
351
352 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
353 /* MATH on Gen6 must be align1, so we can't do writemasks. */
354 math->dst = dst_reg(this, glsl_type::vec4_type);
355 math->dst.type = dst.type;
356 math = emit(MOV(dst, src_reg(math->dst)));
357 } else if (devinfo->gen < 6) {
358 math->base_mrf = 1;
359 math->mlen = src1.file == BAD_FILE ? 1 : 2;
360 }
361
362 return math;
363 }
364
365 void
366 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
367 {
368 if (devinfo->gen < 7) {
369 unreachable("ir_unop_pack_half_2x16 should be lowered");
370 }
371
372 assert(dst.type == BRW_REGISTER_TYPE_UD);
373 assert(src0.type == BRW_REGISTER_TYPE_F);
374
375 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
376 *
377 * Because this instruction does not have a 16-bit floating-point type,
378 * the destination data type must be Word (W).
379 *
380 * The destination must be DWord-aligned and specify a horizontal stride
381 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
382 * each destination channel and the upper word is not modified.
383 *
384 * The above restriction implies that the f32to16 instruction must use
385 * align1 mode, because only in align1 mode is it possible to specify
386 * horizontal stride. We choose here to defy the hardware docs and emit
387 * align16 instructions.
388 *
389 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
390 * instructions. I was partially successful in that the code passed all
391 * tests. However, the code was dubiously correct and fragile, and the
392 * tests were not harsh enough to probe that frailty. Not trusting the
393 * code, I chose instead to remain in align16 mode in defiance of the hw
394 * docs).
395 *
396 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
397 * simulator, emitting a f32to16 in align16 mode with UD as destination
398 * data type is safe. The behavior differs from that specified in the PRM
399 * in that the upper word of each destination channel is cleared to 0.
400 */
401
402 dst_reg tmp_dst(this, glsl_type::uvec2_type);
403 src_reg tmp_src(tmp_dst);
404
405 #if 0
406 /* Verify the undocumented behavior on which the following instructions
407 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
408 * then the result of the bit-or instruction below will be incorrect.
409 *
410 * You should inspect the disasm output in order to verify that the MOV is
411 * not optimized away.
412 */
413 emit(MOV(tmp_dst, src_reg(0x12345678u)));
414 #endif
415
416 /* Give tmp the form below, where "." means untouched.
417 *
418 * w z y x w z y x
419 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
420 *
421 * That the upper word of each write-channel be 0 is required for the
422 * following bit-shift and bit-or instructions to work. Note that this
423 * relies on the undocumented hardware behavior mentioned above.
424 */
425 tmp_dst.writemask = WRITEMASK_XY;
426 emit(F32TO16(tmp_dst, src0));
427
428 /* Give the write-channels of dst the form:
429 * 0xhhhh0000
430 */
431 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
432 emit(SHL(dst, tmp_src, src_reg(16u)));
433
434 /* Finally, give the write-channels of dst the form of packHalf2x16's
435 * output:
436 * 0xhhhhllll
437 */
438 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
439 emit(OR(dst, src_reg(dst), tmp_src));
440 }
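/* A scalar model of the F32TO16/SHL/OR sequence above, assuming a
 * hypothetical f32to16() helper standing in for the hardware float-to-half
 * conversion (sketch only, not driver code):
 */
#if 0
static uint32_t
pack_half_2x16_ref(float x, float y)
{
   /* .x lands in the low word, .y in the high word, matching the SHL/OR. */
   return (uint32_t)f32to16(x) | ((uint32_t)f32to16(y) << 16);
}
#endif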
441
442 void
443 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
444 {
445 if (devinfo->gen < 7) {
446 unreachable("ir_unop_unpack_half_2x16 should be lowered");
447 }
448
449 assert(dst.type == BRW_REGISTER_TYPE_F);
450 assert(src0.type == BRW_REGISTER_TYPE_UD);
451
452 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
453 *
454 * Because this instruction does not have a 16-bit floating-point type,
455 * the source data type must be Word (W). The destination type must be
456 * F (Float).
457 *
458 * To use W as the source data type, we must adjust horizontal strides,
459 * which is only possible in align1 mode. All my [chadv] attempts at
460 * emitting align1 instructions for unpackHalf2x16 failed to pass the
461 * Piglit tests, so I gave up.
462 *
463 * I've verified that, on gen7 hardware and the simulator, it is safe to
464 * emit f16to32 in align16 mode with UD as source data type.
465 */
466
467 dst_reg tmp_dst(this, glsl_type::uvec2_type);
468 src_reg tmp_src(tmp_dst);
469
470 tmp_dst.writemask = WRITEMASK_X;
471 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
472
473 tmp_dst.writemask = WRITEMASK_Y;
474 emit(SHR(tmp_dst, src0, src_reg(16u)));
475
476 dst.writemask = WRITEMASK_XY;
477 emit(F16TO32(dst, tmp_src));
478 }
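/* The inverse of the pack above: the AND keeps the low half-float in .x, the
 * SHR moves the high half into .y, and F16TO32 widens both, i.e. roughly
 * result = vec2(f16to32(v & 0xffff), f16to32(v >> 16)).
 */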
479
480 void
481 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
482 {
483 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
484 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
485 * is not suitable to generate the shift values, but we can use the packed
486 * vector float and a type-converting MOV.
487 */
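/* The four bytes passed to src_reg() below are the 8-bit vector-float (VF)
 * encodings of 0.0, 8.0, 16.0 and 24.0 (sign bit, 3 exponent bits biased by
 * 3, 4 mantissa bits), so the type-converting MOV into the UD register
 * materializes the integer shift vector <0, 8, 16, 24>.
 */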
488 dst_reg shift(this, glsl_type::uvec4_type);
489 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
490
491 dst_reg shifted(this, glsl_type::uvec4_type);
492 src0.swizzle = BRW_SWIZZLE_XXXX;
493 emit(SHR(shifted, src0, src_reg(shift)));
494
495 shifted.type = BRW_REGISTER_TYPE_UB;
496 dst_reg f(this, glsl_type::vec4_type);
497 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
498
499 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
500 }
501
502 void
503 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
504 {
505 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
506 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
507 * is not suitable to generate the shift values, but we can use the packed
508 * vector float and a type-converting MOV.
509 */
510 dst_reg shift(this, glsl_type::uvec4_type);
511 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
512
513 dst_reg shifted(this, glsl_type::uvec4_type);
514 src0.swizzle = BRW_SWIZZLE_XXXX;
515 emit(SHR(shifted, src0, src_reg(shift)));
516
517 shifted.type = BRW_REGISTER_TYPE_B;
518 dst_reg f(this, glsl_type::vec4_type);
519 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
520
521 dst_reg scaled(this, glsl_type::vec4_type);
522 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
523
524 dst_reg max(this, glsl_type::vec4_type);
525 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
526 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
527 }
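/* Scalar sketch of what the two unpack helpers above compute, following the
 * GLSL unpackUnorm4x8()/unpackSnorm4x8() definitions (illustration only, not
 * driver code):
 */
#if 0
#include <stdint.h>

static float
unpack_unorm_byte(uint32_t p, int i)
{
   uint8_t b = (p >> (8 * i)) & 0xff;
   return b / 255.0f;
}

static float
unpack_snorm_byte(uint32_t p, int i)
{
   int8_t b = (p >> (8 * i)) & 0xff;   /* reinterpret the byte as signed */
   float f = b / 127.0f;
   return f < -1.0f ? -1.0f : f > 1.0f ? 1.0f : f;
}
#endif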
528
529 void
530 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
531 {
532 dst_reg saturated(this, glsl_type::vec4_type);
533 vec4_instruction *inst = emit(MOV(saturated, src0));
534 inst->saturate = true;
535
536 dst_reg scaled(this, glsl_type::vec4_type);
537 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
538
539 dst_reg rounded(this, glsl_type::vec4_type);
540 emit(RNDE(rounded, src_reg(scaled)));
541
542 dst_reg u(this, glsl_type::uvec4_type);
543 emit(MOV(u, src_reg(rounded)));
544
545 src_reg bytes(u);
546 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
547 }
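/* Per channel this is round(clamp(c, 0.0, 1.0) * 255.0) -- the saturating MOV
 * does the clamp and RNDE the round-to-even rounding -- and PACK_BYTES then
 * gathers the low byte of each converted channel into the destination dword.
 */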
548
549 void
550 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
551 {
552 dst_reg max(this, glsl_type::vec4_type);
553 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
554
555 dst_reg min(this, glsl_type::vec4_type);
556 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
557
558 dst_reg scaled(this, glsl_type::vec4_type);
559 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
560
561 dst_reg rounded(this, glsl_type::vec4_type);
562 emit(RNDE(rounded, src_reg(scaled)));
563
564 dst_reg i(this, glsl_type::ivec4_type);
565 emit(MOV(i, src_reg(rounded)));
566
567 src_reg bytes(i);
568 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
569 }
570
571 /**
572 * Returns the minimum number of vec4 elements needed to pack a type.
573 *
574 * For simple types, it will return 1 (a single vec4); for matrices, the
575 * number of columns; for array and struct, the sum of the vec4_size of
576 * each of its elements; and for sampler and atomic, zero.
577 *
578 * This method is useful to calculate how much register space is needed to
579 * store a particular type.
580 */
581 extern "C" int
582 type_size_vec4(const struct glsl_type *type)
583 {
584 unsigned int i;
585 int size;
586
587 switch (type->base_type) {
588 case GLSL_TYPE_UINT:
589 case GLSL_TYPE_INT:
590 case GLSL_TYPE_FLOAT:
591 case GLSL_TYPE_BOOL:
592 if (type->is_matrix()) {
593 return type->matrix_columns;
594 } else {
595 /* Regardless of size of vector, it gets a vec4. This is bad
596 * packing for things like floats, but otherwise arrays become a
597 * mess. Hopefully a later pass over the code can pack scalars
598 * down if appropriate.
599 */
600 return 1;
601 }
602 case GLSL_TYPE_ARRAY:
603 assert(type->length > 0);
604 return type_size_vec4(type->fields.array) * type->length;
605 case GLSL_TYPE_STRUCT:
606 size = 0;
607 for (i = 0; i < type->length; i++) {
608 size += type_size_vec4(type->fields.structure[i].type);
609 }
610 return size;
611 case GLSL_TYPE_SUBROUTINE:
612 return 1;
613
614 case GLSL_TYPE_SAMPLER:
615 /* Samplers take up no register space, since they're baked in at
616 * link time.
617 */
618 return 0;
619 case GLSL_TYPE_ATOMIC_UINT:
620 return 0;
621 case GLSL_TYPE_IMAGE:
622 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
623 case GLSL_TYPE_VOID:
624 case GLSL_TYPE_DOUBLE:
625 case GLSL_TYPE_ERROR:
626 case GLSL_TYPE_INTERFACE:
627 unreachable("not reached");
628 }
629
630 return 0;
631 }
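/* Examples of the counting rules above: float, vec2 and vec4 each take one
 * vec4 slot; mat4 takes 4 (one per column); float[8] takes 8; and
 * struct { mat3 m; vec2 v; } takes 3 + 1 = 4.
 */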
632
633 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
634 {
635 init();
636
637 this->file = GRF;
638 this->reg = v->alloc.allocate(type_size_vec4(type));
639
640 if (type->is_array() || type->is_record()) {
641 this->swizzle = BRW_SWIZZLE_NOOP;
642 } else {
643 this->swizzle = brw_swizzle_for_size(type->vector_elements);
644 }
645
646 this->type = brw_type_for_base_type(type);
647 }
648
649 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
650 {
651 assert(size > 0);
652
653 init();
654
655 this->file = GRF;
656 this->reg = v->alloc.allocate(type_size_vec4(type) * size);
657
658 this->swizzle = BRW_SWIZZLE_NOOP;
659
660 this->type = brw_type_for_base_type(type);
661 }
662
663 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
664 {
665 init();
666
667 this->file = GRF;
668 this->reg = v->alloc.allocate(type_size_vec4(type));
669
670 if (type->is_array() || type->is_record()) {
671 this->writemask = WRITEMASK_XYZW;
672 } else {
673 this->writemask = (1 << type->vector_elements) - 1;
674 }
675
676 this->type = brw_type_for_base_type(type);
677 }
678
679 vec4_instruction *
680 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
681 src_reg src0, src_reg src1)
682 {
683 vec4_instruction *inst;
684
685 if (devinfo->gen >= 6) {
686 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
687 inst->conditional_mod = conditionalmod;
688 } else {
689 emit(CMP(dst, src0, src1, conditionalmod));
690
691 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
692 inst->predicate = BRW_PREDICATE_NORMAL;
693 }
694
695 return inst;
696 }
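/* On gen6+ a single SEL with a conditional modifier implements MIN/MAX
 * directly (BRW_CONDITIONAL_L gives a minimum, BRW_CONDITIONAL_GE a maximum).
 * Earlier hardware lacks that, so the flag register is set with an explicit
 * CMP and the SEL is then predicated on it.
 */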
697
698 vec4_instruction *
699 vec4_visitor::emit_lrp(const dst_reg &dst,
700 const src_reg &x, const src_reg &y, const src_reg &a)
701 {
702 if (devinfo->gen >= 6) {
703 /* Note that the instruction's argument order is reversed from GLSL
704 * and the IR.
705 */
706 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
707 fix_3src_operand(x)));
708 } else {
709 /* Earlier generations don't support three source operations, so we
710 * need to emit x*(1-a) + y*a.
711 */
712 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
713 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
714 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
715 y_times_a.writemask = dst.writemask;
716 one_minus_a.writemask = dst.writemask;
717 x_times_one_minus_a.writemask = dst.writemask;
718
719 emit(MUL(y_times_a, y, a));
720 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
721 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
722 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
723 }
724 }
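/* Note on the gen6+ path: LRP computes src0 * src1 + (1 - src0) * src2, so
 * passing (a, y, x) yields a*y + (1-a)*x -- the same value the pre-gen6
 * MUL/ADD sequence builds explicitly.
 */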
725
726 /**
727 * Emits the instructions needed to perform a pull constant load. before_block
728 * and before_inst can be NULL in which case the instruction will be appended
729 * to the end of the instruction list.
730 */
731 void
732 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
733 src_reg surf_index,
734 src_reg offset_reg,
735 bblock_t *before_block,
736 vec4_instruction *before_inst)
737 {
738 assert((before_inst == NULL && before_block == NULL) ||
739 (before_inst && before_block));
740
741 vec4_instruction *pull;
742
743 if (devinfo->gen >= 9) {
744 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
745 src_reg header(this, glsl_type::uvec4_type, 2);
746
747 pull = new(mem_ctx)
748 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
749 dst_reg(header));
750
751 if (before_inst)
752 emit_before(before_block, before_inst, pull);
753 else
754 emit(pull);
755
756 dst_reg index_reg = retype(offset(dst_reg(header), 1),
757 offset_reg.type);
758 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
759
760 if (before_inst)
761 emit_before(before_block, before_inst, pull);
762 else
763 emit(pull);
764
765 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
766 dst,
767 surf_index,
768 header);
769 pull->mlen = 2;
770 pull->header_size = 1;
771 } else if (devinfo->gen >= 7) {
772 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
773
774 grf_offset.type = offset_reg.type;
775
776 pull = MOV(grf_offset, offset_reg);
777
778 if (before_inst)
779 emit_before(before_block, before_inst, pull);
780 else
781 emit(pull);
782
783 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
784 dst,
785 surf_index,
786 src_reg(grf_offset));
787 pull->mlen = 1;
788 } else {
789 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
790 dst,
791 surf_index,
792 offset_reg);
793 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
794 pull->mlen = 1;
795 }
796
797 if (before_inst)
798 emit_before(before_block, before_inst, pull);
799 else
800 emit(pull);
801 }
802
803 src_reg
804 vec4_visitor::emit_uniformize(const src_reg &src)
805 {
806 const src_reg chan_index(this, glsl_type::uint_type);
807 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
808 src.type);
809
810 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
811 ->force_writemask_all = true;
812 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
813 ->force_writemask_all = true;
814
815 return src_reg(dst);
816 }
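/* Roughly: FIND_LIVE_CHANNEL picks the index of an enabled channel and
 * BROADCAST replicates that channel's value of src to every channel of dst,
 * so the result is dynamically uniform -- e.g. suitable for use as a surface
 * or sampler index.
 */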
817
818 src_reg
819 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
820 src_reg coordinate, src_reg sampler)
821 {
822 vec4_instruction *inst =
823 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
824 dst_reg(this, glsl_type::uvec4_type));
825 inst->base_mrf = 2;
826 inst->src[1] = sampler;
827
828 int param_base;
829
830 if (devinfo->gen >= 9) {
831 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
832 vec4_instruction *header_inst = new(mem_ctx)
833 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
834 dst_reg(MRF, inst->base_mrf));
835
836 emit(header_inst);
837
838 inst->mlen = 2;
839 inst->header_size = 1;
840 param_base = inst->base_mrf + 1;
841 } else {
842 inst->mlen = 1;
843 param_base = inst->base_mrf;
844 }
845
846 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
847 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
848 int zero_mask = 0xf & ~coord_mask;
849
850 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
851 coordinate));
852
853 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
854 src_reg(0)));
855
856 emit(inst);
857 return src_reg(inst->dst);
858 }
859
860 bool
861 vec4_visitor::is_high_sampler(src_reg sampler)
862 {
863 if (devinfo->gen < 8 && !devinfo->is_haswell)
864 return false;
865
866 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
867 }
868
869 void
870 vec4_visitor::emit_texture(ir_texture_opcode op,
871 dst_reg dest,
872 const glsl_type *dest_type,
873 src_reg coordinate,
874 int coord_components,
875 src_reg shadow_comparitor,
876 src_reg lod, src_reg lod2,
877 src_reg sample_index,
878 uint32_t constant_offset,
879 src_reg offset_value,
880 src_reg mcs,
881 bool is_cube_array,
882 uint32_t sampler,
883 src_reg sampler_reg)
884 {
885 enum opcode opcode;
886 switch (op) {
887 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
888 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
889 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
890 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
891 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
892 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
893 case ir_tg4: opcode = offset_value.file != BAD_FILE
894 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
895 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
896 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
897 case ir_txb:
898 unreachable("TXB is not valid for vertex shaders.");
899 case ir_lod:
900 unreachable("LOD is not valid for vertex shaders.");
901 default:
902 unreachable("Unrecognized tex op");
903 }
904
905 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
906 opcode, dst_reg(this, dest_type));
907
908 inst->offset = constant_offset;
909
910 /* The message header is necessary for:
911 * - Gen4 (always)
912 * - Gen9+ for selecting SIMD4x2
913 * - Texel offsets
914 * - Gather channel selection
915 * - Sampler indices too large to fit in a 4-bit value.
916 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
917 */
918 inst->header_size =
919 (devinfo->gen < 5 || devinfo->gen >= 9 ||
920 inst->offset != 0 || op == ir_tg4 ||
921 op == ir_texture_samples ||
922 is_high_sampler(sampler_reg)) ? 1 : 0;
923 inst->base_mrf = 2;
924 inst->mlen = inst->header_size;
925 inst->dst.writemask = WRITEMASK_XYZW;
926 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
927
928 inst->src[1] = sampler_reg;
929
930 /* MRF for the first parameter */
931 int param_base = inst->base_mrf + inst->header_size;
932
933 if (op == ir_txs || op == ir_query_levels) {
934 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
935 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
936 inst->mlen++;
937 } else if (op == ir_texture_samples) {
938 inst->dst.writemask = WRITEMASK_X;
939 } else {
940 /* Load the coordinate */
941 /* FINISHME: gl_clamp_mask and saturate */
942 int coord_mask = (1 << coord_components) - 1;
943 int zero_mask = 0xf & ~coord_mask;
944
945 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
946 coordinate));
947 inst->mlen++;
948
949 if (zero_mask != 0) {
950 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
951 src_reg(0)));
952 }
953 /* Load the shadow comparitor */
954 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
955 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
956 WRITEMASK_X),
957 shadow_comparitor));
958 inst->mlen++;
959 }
960
961 /* Load the LOD info */
962 if (op == ir_tex || op == ir_txl) {
963 int mrf, writemask;
964 if (devinfo->gen >= 5) {
965 mrf = param_base + 1;
966 if (shadow_comparitor.file != BAD_FILE) {
967 writemask = WRITEMASK_Y;
968 /* mlen already incremented */
969 } else {
970 writemask = WRITEMASK_X;
971 inst->mlen++;
972 }
973 } else /* devinfo->gen == 4 */ {
974 mrf = param_base;
975 writemask = WRITEMASK_W;
976 }
977 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
978 } else if (op == ir_txf) {
979 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
980 } else if (op == ir_txf_ms) {
981 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
982 sample_index));
983 if (devinfo->gen >= 7) {
984 /* MCS data is in the first channel of `mcs`, but we need to get it into
985 * the .y channel of the second vec4 of params, so replicate .x across
986 * the whole vec4 and then mask off everything except .y
987 */
988 mcs.swizzle = BRW_SWIZZLE_XXXX;
989 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
990 mcs));
991 }
992 inst->mlen++;
993 } else if (op == ir_txd) {
994 const brw_reg_type type = lod.type;
995
996 if (devinfo->gen >= 5) {
997 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
998 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
999 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1000 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1001 inst->mlen++;
1002
1003 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1004 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1005 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1006 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1007 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1008 inst->mlen++;
1009
1010 if (shadow_comparitor.file != BAD_FILE) {
1011 emit(MOV(dst_reg(MRF, param_base + 2,
1012 shadow_comparitor.type, WRITEMASK_Z),
1013 shadow_comparitor));
1014 }
1015 }
1016 } else /* devinfo->gen == 4 */ {
1017 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1018 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1019 inst->mlen += 2;
1020 }
1021 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1022 if (shadow_comparitor.file != BAD_FILE) {
1023 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1024 shadow_comparitor));
1025 }
1026
1027 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1028 offset_value));
1029 inst->mlen++;
1030 }
1031 }
1032
1033 emit(inst);
1034
1035 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1036 * spec requires layers.
1037 */
1038 if (op == ir_txs && is_cube_array) {
1039 emit_math(SHADER_OPCODE_INT_QUOTIENT,
1040 writemask(inst->dst, WRITEMASK_Z),
1041 src_reg(inst->dst), src_reg(6));
1042 }
1043
1044 if (devinfo->gen == 6 && op == ir_tg4) {
1045 emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
1046 }
1047
1048 swizzle_result(op, dest,
1049 src_reg(inst->dst), sampler, dest_type);
1050 }
1051
1052 /**
1053 * Apply workarounds for Gen6 gather with UINT/SINT
1054 */
1055 void
1056 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1057 {
1058 if (!wa)
1059 return;
1060
1061 int width = (wa & WA_8BIT) ? 8 : 16;
1062 dst_reg dst_f = dst;
1063 dst_f.type = BRW_REGISTER_TYPE_F;
1064
1065 /* Convert from UNORM to UINT */
1066 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
1067 emit(MOV(dst, src_reg(dst_f)));
1068
1069 if (wa & WA_SIGN) {
1070 /* Reinterpret the UINT value as a signed INT value by
1071 * shifting the sign bit into place, then shifting back
1072 * preserving sign.
1073 */
1074 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
1075 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
1076 }
1077 }
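/* Worked example for an 8-bit signed format (wa = WA_8BIT | WA_SIGN): a texel
 * byte of 0xff comes back from gather4 as the UNORM value 1.0; the MUL/MOV
 * pair turns that back into the integer 255, and the SHL/ASR by 24 then
 * sign-extends it to -1, the value an SINT fetch should have returned.
 */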
1078
1079 /**
1080 * Set up the gather channel based on the swizzle, for gather4.
1081 */
1082 uint32_t
1083 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
1084 {
1085 int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
1086 switch (swiz) {
1087 case SWIZZLE_X: return 0;
1088 case SWIZZLE_Y:
1089 /* gather4 sampler is broken for green channel on RG32F --
1090 * we must ask for blue instead.
1091 */
1092 if (key_tex->gather_channel_quirk_mask & (1 << sampler))
1093 return 2;
1094 return 1;
1095 case SWIZZLE_Z: return 2;
1096 case SWIZZLE_W: return 3;
1097 default:
1098 unreachable("Not reached"); /* zero, one swizzles handled already */
1099 }
1100 }
1101
1102 void
1103 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
1104 src_reg orig_val, uint32_t sampler,
1105 const glsl_type *dest_type)
1106 {
1107 int s = key_tex->swizzles[sampler];
1108
1109 dst_reg swizzled_result = dest;
1110
1111 if (op == ir_query_levels) {
1112 /* # levels is in .w */
1113 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1114 emit(MOV(swizzled_result, orig_val));
1115 return;
1116 }
1117
1118 if (op == ir_txs || dest_type == glsl_type::float_type
1119 || s == SWIZZLE_NOOP || op == ir_tg4) {
1120 emit(MOV(swizzled_result, orig_val));
1121 return;
1122 }
1123
1124
1125 int zero_mask = 0, one_mask = 0, copy_mask = 0;
1126 int swizzle[4] = {0};
1127
1128 for (int i = 0; i < 4; i++) {
1129 switch (GET_SWZ(s, i)) {
1130 case SWIZZLE_ZERO:
1131 zero_mask |= (1 << i);
1132 break;
1133 case SWIZZLE_ONE:
1134 one_mask |= (1 << i);
1135 break;
1136 default:
1137 copy_mask |= (1 << i);
1138 swizzle[i] = GET_SWZ(s, i);
1139 break;
1140 }
1141 }
1142
1143 if (copy_mask) {
1144 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1145 swizzled_result.writemask = copy_mask;
1146 emit(MOV(swizzled_result, orig_val));
1147 }
1148
1149 if (zero_mask) {
1150 swizzled_result.writemask = zero_mask;
1151 emit(MOV(swizzled_result, src_reg(0.0f)));
1152 }
1153
1154 if (one_mask) {
1155 swizzled_result.writemask = one_mask;
1156 emit(MOV(swizzled_result, src_reg(1.0f)));
1157 }
1158 }
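/* Example: a LUMINANCE texture swizzled as (R, R, R, ONE) ends up with
 * copy_mask covering .xyz (all sourcing the X channel), one_mask covering .w
 * and an empty zero_mask, so the swizzled result is built with two MOVs.
 */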
1159
1160 void
1161 vec4_visitor::gs_emit_vertex(int stream_id)
1162 {
1163 unreachable("not reached");
1164 }
1165
1166 void
1167 vec4_visitor::gs_end_primitive()
1168 {
1169 unreachable("not reached");
1170 }
1171
1172 void
1173 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1174 dst_reg dst, src_reg offset,
1175 src_reg src0, src_reg src1)
1176 {
1177 unsigned mlen = 0;
1178
1179 /* Set the atomic operation offset. */
1180 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
1181 mlen++;
1182
1183 /* Set the atomic operation arguments. */
1184 if (src0.file != BAD_FILE) {
1185 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
1186 mlen++;
1187 }
1188
1189 if (src1.file != BAD_FILE) {
1190 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
1191 mlen++;
1192 }
1193
1194 /* Emit the instruction. Note that this maps to the normal SIMD8
1195 * untyped atomic message on Ivy Bridge, but that's OK because
1196 * unused channels will be masked out.
1197 */
1198 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1199 brw_message_reg(0),
1200 src_reg(surf_index), src_reg(atomic_op));
1201 inst->mlen = mlen;
1202 }
1203
1204 void
1205 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1206 src_reg offset)
1207 {
1208 /* Set the surface read offset. */
1209 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
1210
1211 /* Emit the instruction. Note that this maps to the normal SIMD8
1212 * untyped surface read message, but that's OK because unused
1213 * channels will be masked out.
1214 */
1215 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1216 brw_message_reg(0),
1217 src_reg(surf_index), src_reg(1));
1218 inst->mlen = 1;
1219 }
1220
1221 void
1222 vec4_visitor::emit_ndc_computation()
1223 {
1224 if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1225 return;
1226
1227 /* Get the position */
1228 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1229
1230 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1231 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1232 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1233
1234 current_annotation = "NDC";
1235 dst_reg ndc_w = ndc;
1236 ndc_w.writemask = WRITEMASK_W;
1237 src_reg pos_w = pos;
1238 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1239 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1240
1241 dst_reg ndc_xyz = ndc;
1242 ndc_xyz.writemask = WRITEMASK_XYZ;
1243
1244 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1245 }
1246
1247 void
1248 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1249 {
1250 if (devinfo->gen < 6 &&
1251 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1252 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1253 devinfo->has_negative_rhw_bug)) {
1254 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1255 dst_reg header1_w = header1;
1256 header1_w.writemask = WRITEMASK_W;
1257
1258 emit(MOV(header1, 0u));
1259
1260 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1261 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1262
1263 current_annotation = "Point size";
1264 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
1265 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
1266 }
1267
1268 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1269 current_annotation = "Clipping flags";
1270 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1271 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1272
1273 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
1274 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
1275 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1276
1277 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
1278 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
1279 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
1280 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1281 }
1282
1283 /* i965 clipping workaround:
1284 * 1) Test for -ve rhw
1285 * 2) If set,
1286 * set ndc = (0,0,0,0)
1287 * set ucp[6] = 1
1288 *
1289 * Later, clipping will detect ucp[6] and ensure the primitive is
1290 * clipped against all fixed planes.
1291 */
1292 if (devinfo->has_negative_rhw_bug &&
1293 output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1294 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1295 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1296 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
1297 vec4_instruction *inst;
1298 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
1299 inst->predicate = BRW_PREDICATE_NORMAL;
1300 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1301 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
1302 inst->predicate = BRW_PREDICATE_NORMAL;
1303 }
1304
1305 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1306 } else if (devinfo->gen < 6) {
1307 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
1308 } else {
1309 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
1310 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1311 dst_reg reg_w = reg;
1312 reg_w.writemask = WRITEMASK_W;
1313 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1314 reg_as_src.type = reg_w.type;
1315 reg_as_src.swizzle = brw_swizzle_for_size(1);
1316 emit(MOV(reg_w, reg_as_src));
1317 }
1318 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1319 dst_reg reg_y = reg;
1320 reg_y.writemask = WRITEMASK_Y;
1321 reg_y.type = BRW_REGISTER_TYPE_D;
1322 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1323 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1324 }
1325 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1326 dst_reg reg_z = reg;
1327 reg_z.writemask = WRITEMASK_Z;
1328 reg_z.type = BRW_REGISTER_TYPE_D;
1329 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1330 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1331 }
1332 }
1333 }
1334
1335 vec4_instruction *
1336 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1337 {
1338 assert(varying < VARYING_SLOT_MAX);
1339 assert(output_reg[varying].type == reg.type);
1340 current_annotation = output_reg_annotation[varying];
1341 if (output_reg[varying].file != BAD_FILE)
1342 return emit(MOV(reg, src_reg(output_reg[varying])));
1343 else
1344 return NULL;
1345 }
1346
1347 void
1348 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1349 {
1350 reg.type = BRW_REGISTER_TYPE_F;
1351 output_reg[varying].type = reg.type;
1352
1353 switch (varying) {
1354 case VARYING_SLOT_PSIZ:
1355 {
1356 /* PSIZ is always in slot 0, and is coupled with other flags. */
1357 current_annotation = "indices, point width, clip flags";
1358 emit_psiz_and_flags(reg);
1359 break;
1360 }
1361 case BRW_VARYING_SLOT_NDC:
1362 current_annotation = "NDC";
1363 if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1364 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1365 break;
1366 case VARYING_SLOT_POS:
1367 current_annotation = "gl_Position";
1368 if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1369 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1370 break;
1371 case VARYING_SLOT_EDGE:
1372 /* This is present when doing unfilled polygons. We're supposed to copy
1373 * the edge flag from the user-provided vertex array
1374 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1375 * of that attribute (starts as 1.0f). This is then used in clipping to
1376 * determine which edges should be drawn as wireframe.
1377 */
1378 current_annotation = "edge flag";
1379 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1380 glsl_type::float_type, WRITEMASK_XYZW))));
1381 break;
1382 case BRW_VARYING_SLOT_PAD:
1383 /* No need to write to this slot */
1384 break;
1385 default:
1386 emit_generic_urb_slot(reg, varying);
1387 break;
1388 }
1389 }
1390
1391 static int
1392 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1393 {
1394 if (devinfo->gen >= 6) {
1395 /* URB data written (does not include the message header reg) must
1396 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1397 * section 5.4.3.2.2: URB_INTERLEAVED.
1398 *
1399 * URB entries are allocated on a multiple of 1024 bits, so an
1400 * extra 128 bits written here to make the end align to 256 is
1401 * no problem.
1402 */
1403 if ((mlen % 2) != 1)
1404 mlen++;
1405 }
1406
1407 return mlen;
1408 }
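/* mlen here counts the header plus the data registers, so the rounding keeps
 * the data portion (mlen - 1) even: e.g. mlen = 4 (header plus three slot
 * registers) is padded to 5, while an already-odd mlen such as 3 is left
 * alone.
 */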
1409
1410
1411 /**
1412 * Generates the VUE payload plus the necessary URB write instructions to
1413 * output it.
1414 *
1415 * The VUE layout is documented in Volume 2a.
1416 */
1417 void
1418 vec4_visitor::emit_vertex()
1419 {
1420 /* MRF 0 is reserved for the debugger, so start with message header
1421 * in MRF 1.
1422 */
1423 int base_mrf = 1;
1424 int mrf = base_mrf;
1425 /* In the process of generating our URB write message contents, we
1426 * may need to unspill a register or load from an array. Those
1427 * reads would use MRFs 14-15.
1428 */
1429 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1430
1431 /* The following assertion verifies that max_usable_mrf causes an
1432 * even-numbered amount of URB write data, which will meet gen6's
1433 * requirements for length alignment.
1434 */
1435 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1436
1437 /* First mrf is the g0-based message header containing URB handles and
1438 * such.
1439 */
1440 emit_urb_write_header(mrf++);
1441
1442 if (devinfo->gen < 6) {
1443 emit_ndc_computation();
1444 }
1445
1446 /* We may need to split this up into several URB writes, so do them in a
1447 * loop.
1448 */
1449 int slot = 0;
1450 bool complete = false;
1451 do {
1452 /* URB offset is in URB row increments, and each of our MRFs is half of
1453 * one of those, since we're doing interleaved writes.
1454 */
1455 int offset = slot / 2;
1456
1457 mrf = base_mrf + 1;
1458 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1459 emit_urb_slot(dst_reg(MRF, mrf++),
1460 prog_data->vue_map.slot_to_varying[slot]);
1461
1462 /* If this was max_usable_mrf, we can't fit anything more into this
1463 * URB WRITE. Same thing if we reached the maximum length available.
1464 */
1465 if (mrf > max_usable_mrf ||
1466 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1467 slot++;
1468 break;
1469 }
1470 }
1471
1472 complete = slot >= prog_data->vue_map.num_slots;
1473 current_annotation = "URB write";
1474 vec4_instruction *inst = emit_urb_write_opcode(complete);
1475 inst->base_mrf = base_mrf;
1476 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1477 inst->offset += offset;
1478 } while(!complete);
1479 }
1480
1481
1482 src_reg
1483 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1484 src_reg *reladdr, int reg_offset)
1485 {
1486 /* Because we store the values to scratch interleaved like our
1487 * vertex data, we need to scale the vec4 index by 2.
1488 */
1489 int message_header_scale = 2;
1490
1491 /* Pre-gen6, the message header uses byte offsets instead of vec4
1492 * (16-byte) offset units.
1493 */
1494 if (devinfo->gen < 6)
1495 message_header_scale *= 16;
1496
1497 if (reladdr) {
1498 src_reg index = src_reg(this, glsl_type::int_type);
1499
1500 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1501 src_reg(reg_offset)));
1502 emit_before(block, inst, MUL(dst_reg(index), index,
1503 src_reg(message_header_scale)));
1504
1505 return index;
1506 } else {
1507 return src_reg(reg_offset * message_header_scale);
1508 }
1509 }
1510
1511 src_reg
1512 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
1513 src_reg *reladdr, int reg_offset)
1514 {
1515 if (reladdr) {
1516 src_reg index = src_reg(this, glsl_type::int_type);
1517
1518 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1519 src_reg(reg_offset)));
1520
1521 /* Pre-gen6, the message header uses byte offsets instead of vec4
1522 * (16-byte) offset units.
1523 */
1524 if (devinfo->gen < 6) {
1525 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
1526 }
1527
1528 return index;
1529 } else if (devinfo->gen >= 8) {
1530 /* Store the offset in a GRF so we can send-from-GRF. */
1531 src_reg offset = src_reg(this, glsl_type::int_type);
1532 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
1533 return offset;
1534 } else {
1535 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
1536 return src_reg(reg_offset * message_header_scale);
1537 }
1538 }
1539
1540 /**
1541 * Emits an instruction before @inst to load the value named by @orig_src
1542 * from scratch space at @base_offset to @temp.
1543 *
1544 * @base_offset is measured in 32-byte units (the size of a register).
1545 */
1546 void
1547 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1548 dst_reg temp, src_reg orig_src,
1549 int base_offset)
1550 {
1551 int reg_offset = base_offset + orig_src.reg_offset;
1552 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1553 reg_offset);
1554
1555 emit_before(block, inst, SCRATCH_READ(temp, index));
1556 }
1557
1558 /**
1559 * Emits an instruction after @inst to store the value to be written
1560 * to @orig_dst to scratch space at @base_offset, from @temp.
1561 *
1562 * @base_offset is measured in 32-byte units (the size of a register).
1563 */
1564 void
1565 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1566 int base_offset)
1567 {
1568 int reg_offset = base_offset + inst->dst.reg_offset;
1569 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1570 reg_offset);
1571
1572 /* Create a temporary register to store *inst's result in.
1573 *
1574 * We have to be careful in MOVing from our temporary result register in
1575 * the scratch write. If we swizzle from channels of the temporary that
1576 * weren't initialized, it will confuse live interval analysis, which will
1577 * make spilling fail to make progress.
1578 */
1579 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1580 inst->dst.type),
1581 brw_swizzle_for_mask(inst->dst.writemask));
1582 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1583 inst->dst.writemask));
1584 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1585 if (inst->opcode != BRW_OPCODE_SEL)
1586 write->predicate = inst->predicate;
1587 write->ir = inst->ir;
1588 write->annotation = inst->annotation;
1589 inst->insert_after(block, write);
1590
1591 inst->dst.file = temp.file;
1592 inst->dst.reg = temp.reg;
1593 inst->dst.reg_offset = temp.reg_offset;
1594 inst->dst.reladdr = NULL;
1595 }
1596
1597 /**
1598 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1599 * adds the scratch read(s) before \p inst. The function also checks for
1600 * recursive reladdr scratch accesses, issuing the corresponding scratch
1601 * loads and rewriting reladdr references accordingly.
1602 *
1603 * \return \p src if it did not require a scratch load, otherwise, the
1604 * register holding the result of the scratch load that the caller should
1605 * use to rewrite src.
1606 */
1607 src_reg
1608 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1609 vec4_instruction *inst, src_reg src)
1610 {
1611 /* Resolve recursive reladdr scratch access by calling ourselves
1612 * with src.reladdr
1613 */
1614 if (src.reladdr)
1615 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1616 *src.reladdr);
1617
1618 /* Now handle scratch access on src */
1619 if (src.file == GRF && scratch_loc[src.reg] != -1) {
1620 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1621 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
1622 src.reg = temp.reg;
1623 src.reg_offset = temp.reg_offset;
1624 src.reladdr = NULL;
1625 }
1626
1627 return src;
1628 }
1629
1630 /**
1631 * We can't generally support array access in GRF space, because a
1632 * single instruction's destination can only span 2 contiguous
1633 * registers. So, we send all GRF arrays that get variable index
1634 * access to scratch space.
1635 */
1636 void
1637 vec4_visitor::move_grf_array_access_to_scratch()
1638 {
1639 int scratch_loc[this->alloc.count];
1640 memset(scratch_loc, -1, sizeof(scratch_loc));
1641
1642 /* First, calculate the set of virtual GRFs that need to be punted
1643 * to scratch due to having any array access on them, and where in
1644 * scratch.
1645 */
1646 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1647 if (inst->dst.file == GRF && inst->dst.reladdr) {
1648 if (scratch_loc[inst->dst.reg] == -1) {
1649 scratch_loc[inst->dst.reg] = last_scratch;
1650 last_scratch += this->alloc.sizes[inst->dst.reg];
1651 }
1652
1653 for (src_reg *iter = inst->dst.reladdr;
1654 iter->reladdr;
1655 iter = iter->reladdr) {
1656 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1657 scratch_loc[iter->reg] = last_scratch;
1658 last_scratch += this->alloc.sizes[iter->reg];
1659 }
1660 }
1661 }
1662
1663 for (int i = 0 ; i < 3; i++) {
1664 for (src_reg *iter = &inst->src[i];
1665 iter->reladdr;
1666 iter = iter->reladdr) {
1667 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1668 scratch_loc[iter->reg] = last_scratch;
1669 last_scratch += this->alloc.sizes[iter->reg];
1670 }
1671 }
1672 }
1673 }
1674
1675 /* Now, for anything that will be accessed through scratch, rewrite
1676 * it to load/store. Note that this is a _safe list walk, because
1677 * we may generate a new scratch_write instruction after the one
1678 * we're processing.
1679 */
1680 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1681 /* Set up the annotation tracking for new generated instructions. */
1682 base_ir = inst->ir;
1683 current_annotation = inst->annotation;
1684
1685 /* First handle scratch access on the dst. Notice we have to handle
1686 * the case where the dst's reladdr also points to scratch space.
1687 */
1688 if (inst->dst.reladdr)
1689 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1690 *inst->dst.reladdr);
1691
1692 /* Now that we have handled any (possibly recursive) reladdr scratch
1693 * accesses for dst we can safely do the scratch write for dst itself
1694 */
1695 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
1696 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
1697
1698 /* Now handle scratch access on any src. In this case, since inst->src[i]
1699 * already is a src_reg, we can just call emit_resolve_reladdr with
1700 * inst->src[i] and it will take care of handling scratch loads for
1701 * both src and src.reladdr (recursively).
1702 */
1703 for (int i = 0 ; i < 3; i++) {
1704 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1705 inst->src[i]);
1706 }
1707 }
1708 }
1709
1710 /**
1711 * Emits an instruction before @inst to load the value named by @orig_src
1712 * from the pull constant buffer (surface) at @base_offset to @temp.
1713 */
1714 void
1715 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1716 dst_reg temp, src_reg orig_src,
1717 int base_offset)
1718 {
1719 int reg_offset = base_offset + orig_src.reg_offset;
1720 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
1721 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
1722 reg_offset);
1723
1724 emit_pull_constant_load_reg(temp,
1725 index,
1726 offset,
1727 block, inst);
1728 }
1729
1730 /**
1731 * Implements array access of uniforms by inserting a
1732 * PULL_CONSTANT_LOAD instruction.
1733 *
1734 * Unlike temporary GRF array access (where we don't support it due to
1735 * the difficulty of doing relative addressing on instruction
1736 * destinations), we could potentially do array access of uniforms
1737 * that were loaded in GRF space as push constants. In real-world
1738 * usage we've seen, though, the arrays being used are always larger
1739 * than we could load as push constants, so just always move all
1740 * uniform array access out to a pull constant buffer.
1741 */
1742 void
1743 vec4_visitor::move_uniform_array_access_to_pull_constants()
1744 {
1745 int pull_constant_loc[this->uniforms];
1746 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1747 bool nested_reladdr;
1748
1749 /* Walk through and find array access of uniforms. Put a copy of that
1750 * uniform in the pull constant buffer.
1751 *
1752 * Note that we don't move constant-indexed accesses to arrays. No
1753 * testing has been done of the performance impact of this choice.
1754 */
1755 do {
1756 nested_reladdr = false;
1757
1758 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1759 for (int i = 0 ; i < 3; i++) {
1760 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1761 continue;
1762
1763 int uniform = inst->src[i].reg;
1764
1765 if (inst->src[i].reladdr->reladdr)
1766 nested_reladdr = true; /* will need another pass */
1767
1768 /* If this array isn't already present in the pull constant buffer,
1769 * add it.
1770 */
1771 if (pull_constant_loc[uniform] == -1) {
1772 const gl_constant_value **values =
1773 &stage_prog_data->param[uniform * 4];
1774
1775 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
1776
1777 assert(uniform < uniform_array_size);
1778 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
1779 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1780 = values[j];
1781 }
1782 }
1783
1784 /* Set up the annotation tracking for new generated instructions. */
1785 base_ir = inst->ir;
1786 current_annotation = inst->annotation;
1787
1788 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1789
1790 emit_pull_constant_load(block, inst, temp, inst->src[i],
1791 pull_constant_loc[uniform]);
1792
1793 inst->src[i].file = temp.file;
1794 inst->src[i].reg = temp.reg;
1795 inst->src[i].reg_offset = temp.reg_offset;
1796 inst->src[i].reladdr = NULL;
1797 }
1798 }
1799 } while (nested_reladdr);
1800
1801 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1802 * no need to track them as larger-than-vec4 objects. This will be
1803 * relied on in cutting out unused uniform vectors from push
1804 * constants.
1805 */
1806 split_uniform_registers();
1807 }
1808
1809 void
1810 vec4_visitor::resolve_ud_negate(src_reg *reg)
1811 {
1812 if (reg->type != BRW_REGISTER_TYPE_UD ||
1813 !reg->negate)
1814 return;
1815
1816 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1817 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1818 *reg = temp;
1819 }
1820
1821 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1822 void *log_data,
1823 const struct brw_sampler_prog_key_data *key_tex,
1824 struct brw_vue_prog_data *prog_data,
1825 const nir_shader *shader,
1826 void *mem_ctx,
1827 bool no_spills,
1828 int shader_time_index)
1829 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1830 key_tex(key_tex),
1831 prog_data(prog_data),
1832 fail_msg(NULL),
1833 first_non_payload_grf(0),
1834 need_all_constants_in_pull_buffer(false),
1835 no_spills(no_spills),
1836 shader_time_index(shader_time_index),
1837 last_scratch(0)
1838 {
1839 this->failed = false;
1840
1841 this->base_ir = NULL;
1842 this->current_annotation = NULL;
1843 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1844
1845 this->virtual_grf_start = NULL;
1846 this->virtual_grf_end = NULL;
1847 this->live_intervals = NULL;
1848
1849 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1850
1851 this->uniforms = 0;
1852
1853 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
1854 * at least one. See setup_uniforms() in brw_vec4.cpp.
1855 */
1856 this->uniform_array_size = 1;
1857 if (prog_data) {
1858 this->uniform_array_size =
1859 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
1860 }
1861
1862 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
1863 }
1864
1865 vec4_visitor::~vec4_visitor()
1866 {
1867 }
1868
1869
1870 void
1871 vec4_visitor::fail(const char *format, ...)
1872 {
1873 va_list va;
1874 char *msg;
1875
1876 if (failed)
1877 return;
1878
1879 failed = true;
1880
1881 va_start(va, format);
1882 msg = ralloc_vasprintf(mem_ctx, format, va);
1883 va_end(va);
1884 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1885
1886 this->fail_msg = msg;
1887
1888 if (debug_enabled) {
1889 fprintf(stderr, "%s", msg);
1890 }
1891 }
1892
1893 } /* namespace brw */