src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
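/* First MRF used by the scratch read/write and pull-constant messages emitted
 * below (SCRATCH_READ/SCRATCH_WRITE and the gen4-6 pull constant load derive
 * their base_mrf from this).  Gen6 exposes more MRFs than earlier generations,
 * hence the higher starting point there.
 */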
29 #define FIRST_SPILL_MRF(gen) (gen == 6 ? 21 : 13)
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->predicate = BRW_PREDICATE_NONE;
49 this->predicate_inverse = false;
50 this->target = 0;
51 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
52 this->shadow_compare = false;
53 this->ir = NULL;
54 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
55 this->header_size = 0;
56 this->flag_subreg = 0;
57 this->mlen = 0;
58 this->base_mrf = 0;
59 this->offset = 0;
60 this->annotation = NULL;
61 }
62
63 vec4_instruction *
64 vec4_visitor::emit(vec4_instruction *inst)
65 {
66 inst->ir = this->base_ir;
67 inst->annotation = this->current_annotation;
68
69 this->instructions.push_tail(inst);
70
71 return inst;
72 }
73
74 vec4_instruction *
75 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
76 vec4_instruction *new_inst)
77 {
78 new_inst->ir = inst->ir;
79 new_inst->annotation = inst->annotation;
80
81 inst->insert_before(block, new_inst);
82
83 return inst;
84 }
85
86 vec4_instruction *
87 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
88 const src_reg &src1, const src_reg &src2)
89 {
90 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
91 }
92
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
96 const src_reg &src1)
97 {
98 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
103 {
104 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
109 {
110 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
111 }
112
113 vec4_instruction *
114 vec4_visitor::emit(enum opcode opcode)
115 {
116 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
117 }
118
119 #define ALU1(op) \
120 vec4_instruction * \
121 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
122 { \
123 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
124 }
125
126 #define ALU2(op) \
127 vec4_instruction * \
128 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
129 const src_reg &src1) \
130 { \
131 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
132 src0, src1); \
133 }
134
135 #define ALU2_ACC(op) \
136 vec4_instruction * \
137 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
138 const src_reg &src1) \
139 { \
140 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
141 BRW_OPCODE_##op, dst, src0, src1); \
142 inst->writes_accumulator = true; \
143 return inst; \
144 }
145
146 #define ALU3(op) \
147 vec4_instruction * \
148 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
149 const src_reg &src1, const src_reg &src2) \
150 { \
151 assert(devinfo->gen >= 6); \
152 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
153 src0, src1, src2); \
154 }
155
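/* Each invocation below defines the corresponding builder helper on
 * vec4_visitor.  For example, ALU2(ADD) expands to roughly:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * The helpers only allocate the instruction; callers still pass the result
 * to emit() to append it to the instruction list.
 */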
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU1(F32TO16)
163 ALU1(F16TO32)
164 ALU2(ADD)
165 ALU2(MUL)
166 ALU2_ACC(MACH)
167 ALU2(AND)
168 ALU2(OR)
169 ALU2(XOR)
170 ALU2(DP3)
171 ALU2(DP4)
172 ALU2(DPH)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(MAC)
188
189 /** Gen4 predicated IF. */
190 vec4_instruction *
191 vec4_visitor::IF(enum brw_predicate predicate)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
196 inst->predicate = predicate;
197
198 return inst;
199 }
200
201 /** Gen6 IF with embedded comparison. */
202 vec4_instruction *
203 vec4_visitor::IF(src_reg src0, src_reg src1,
204 enum brw_conditional_mod condition)
205 {
206 assert(devinfo->gen == 6);
207
208 vec4_instruction *inst;
209
210 resolve_ud_negate(&src0);
211 resolve_ud_negate(&src1);
212
213 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
214 src0, src1);
215 inst->conditional_mod = condition;
216
217 return inst;
218 }
219
220 /**
221 * CMP: Sets the low bit of each destination channel to the result of the
222 * comparison (the upper bits are undefined), and updates the flag register
223 * with the packed 16 bits of the result.
224 */
225 vec4_instruction *
226 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
227 enum brw_conditional_mod condition)
228 {
229 vec4_instruction *inst;
230
231 /* Take the instruction:
232 *
233 * CMP null<d> src0<f> src1<f>
234 *
235 * Original gen4 does type conversion to the destination type before
236 * comparison, producing garbage results for floating point comparisons.
237 *
238 * The destination type doesn't matter on newer generations, so we set the
239 * type to match src0 so we can compact the instruction.
240 */
241 dst.type = src0.type;
242 if (dst.file == HW_REG)
243 dst.fixed_hw_reg.type = dst.type;
244
245 resolve_ud_negate(&src0);
246 resolve_ud_negate(&src1);
247
248 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
249 inst->conditional_mod = condition;
250
251 return inst;
252 }
253
254 vec4_instruction *
255 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
256 {
257 vec4_instruction *inst;
258
259 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
260 dst, index);
261 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
262 inst->mlen = 2;
263
264 return inst;
265 }
266
267 vec4_instruction *
268 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
269 const src_reg &index)
270 {
271 vec4_instruction *inst;
272
273 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
274 dst, src, index);
275 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
276 inst->mlen = 3;
277
278 return inst;
279 }
280
281 void
282 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
283 {
284 static enum opcode dot_opcodes[] = {
285 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
286 };
287
288 emit(dot_opcodes[elements - 2], dst, src0, src1);
289 }
290
291 src_reg
292 vec4_visitor::fix_3src_operand(const src_reg &src)
293 {
294 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
295 * able to use vertical stride of zero to replicate the vec4 uniform, like
296 *
297 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
298 *
299 * But you can't, since vertical stride is always four in three-source
300 * instructions. Instead, insert a MOV instruction to do the replication so
301 * that the three-source instruction can consume it.
302 */
303
304 /* The MOV is only needed if the source is a uniform or immediate. */
305 if (src.file != UNIFORM && src.file != IMM)
306 return src;
307
308 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
309 return src;
310
311 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
312 expanded.type = src.type;
313 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
314 return src_reg(expanded);
315 }
316
317 src_reg
318 vec4_visitor::resolve_source_modifiers(const src_reg &src)
319 {
320 if (!src.abs && !src.negate)
321 return src;
322
323 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
324 resolved.type = src.type;
325 emit(MOV(resolved, src));
326
327 return src_reg(resolved);
328 }
329
330 src_reg
331 vec4_visitor::fix_math_operand(const src_reg &src)
332 {
333 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
334 return src;
335
336 /* The gen6 math instruction ignores the source modifiers --
337 * swizzle, abs, negate, and at least some parts of the register
338 * region description.
339 *
340 * Rather than trying to enumerate all these cases, *always* expand the
341 * operand to a temp GRF for gen6.
342 *
343 * For gen7, keep the operand as-is, except if immediate, which gen7 still
344 * can't use.
345 */
346
347 if (devinfo->gen == 7 && src.file != IMM)
348 return src;
349
350 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
351 expanded.type = src.type;
352 emit(MOV(expanded, src));
353 return src_reg(expanded);
354 }
355
356 vec4_instruction *
357 vec4_visitor::emit_math(enum opcode opcode,
358 const dst_reg &dst,
359 const src_reg &src0, const src_reg &src1)
360 {
361 vec4_instruction *math =
362 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
363
364 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
365 /* MATH on Gen6 must be align1, so we can't do writemasks. */
366 math->dst = dst_reg(this, glsl_type::vec4_type);
367 math->dst.type = dst.type;
368 math = emit(MOV(dst, src_reg(math->dst)));
369 } else if (devinfo->gen < 6) {
370 math->base_mrf = 1;
371 math->mlen = src1.file == BAD_FILE ? 1 : 2;
372 }
373
374 return math;
375 }
376
377 void
378 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
379 {
380 if (devinfo->gen < 7) {
381 unreachable("ir_unop_pack_half_2x16 should be lowered");
382 }
383
384 assert(dst.type == BRW_REGISTER_TYPE_UD);
385 assert(src0.type == BRW_REGISTER_TYPE_F);
386
387 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
388 *
389 * Because this instruction does not have a 16-bit floating-point type,
390 * the destination data type must be Word (W).
391 *
392 * The destination must be DWord-aligned and specify a horizontal stride
393 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
394 * each destination channel and the upper word is not modified.
395 *
396 * The above restriction implies that the f32to16 instruction must use
397 * align1 mode, because only in align1 mode is it possible to specify
398 * horizontal stride. We choose here to defy the hardware docs and emit
399 * align16 instructions.
400 *
401 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
402 * instructions. I was partially successful in that the code passed all
403 * tests. However, the code was dubiously correct and fragile, and the
404 * tests were not harsh enough to probe that frailty. Not trusting the
405 * code, I chose instead to remain in align16 mode in defiance of the hw
406 * docs).
407 *
408 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
409 * simulator, emitting a f32to16 in align16 mode with UD as destination
410 * data type is safe. The behavior differs from that specified in the PRM
411 * in that the upper word of each destination channel is cleared to 0.
412 */
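/* Worked example, assuming standard IEEE half-float encodings: packing
 * vec2(1.0, 2.0) produces the halves 0x3C00 and 0x4000, so after the
 * F32TO16, SHL and OR below each enabled channel of dst holds 0x40003C00,
 * i.e. src0.y in the high word and src0.x in the low word.
 */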
413
414 dst_reg tmp_dst(this, glsl_type::uvec2_type);
415 src_reg tmp_src(tmp_dst);
416
417 #if 0
418 /* Verify the undocumented behavior on which the following instructions
419 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
420 * then the result of the bit-or instruction below will be incorrect.
421 *
422 * You should inspect the disasm output in order to verify that the MOV is
423 * not optimized away.
424 */
425 emit(MOV(tmp_dst, src_reg(0x12345678u)));
426 #endif
427
428 /* Give tmp the form below, where "." means untouched.
429 *
430 * w z y x w z y x
431 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
432 *
433 * That the upper word of each write-channel be 0 is required for the
434 * following bit-shift and bit-or instructions to work. Note that this
435 * relies on the undocumented hardware behavior mentioned above.
436 */
437 tmp_dst.writemask = WRITEMASK_XY;
438 emit(F32TO16(tmp_dst, src0));
439
440 /* Give the write-channels of dst the form:
441 * 0xhhhh0000
442 */
443 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
444 emit(SHL(dst, tmp_src, src_reg(16u)));
445
446 /* Finally, give the write-channels of dst the form of packHalf2x16's
447 * output:
448 * 0xhhhhllll
449 */
450 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
451 emit(OR(dst, src_reg(dst), tmp_src));
452 }
453
454 void
455 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
456 {
457 if (devinfo->gen < 7) {
458 unreachable("ir_unop_unpack_half_2x16 should be lowered");
459 }
460
461 assert(dst.type == BRW_REGISTER_TYPE_F);
462 assert(src0.type == BRW_REGISTER_TYPE_UD);
463
464 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
465 *
466 * Because this instruction does not have a 16-bit floating-point type,
467 * the source data type must be Word (W). The destination type must be
468 * F (Float).
469 *
470 * To use W as the source data type, we must adjust horizontal strides,
471 * which is only possible in align1 mode. All my [chadv] attempts at
472 * emitting align1 instructions for unpackHalf2x16 failed to pass the
473 * Piglit tests, so I gave up.
474 *
475 * I've verified that, on gen7 hardware and the simulator, it is safe to
476 * emit f16to32 in align16 mode with UD as source data type.
477 */
478
479 dst_reg tmp_dst(this, glsl_type::uvec2_type);
480 src_reg tmp_src(tmp_dst);
481
482 tmp_dst.writemask = WRITEMASK_X;
483 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
484
485 tmp_dst.writemask = WRITEMASK_Y;
486 emit(SHR(tmp_dst, src0, src_reg(16u)));
487
488 dst.writemask = WRITEMASK_XY;
489 emit(F16TO32(dst, tmp_src));
490 }
491
492 void
493 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
494 {
495 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
496 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
497 * is not suitable to generate the shift values, but we can use the packed
498 * vector float and a type-converting MOV.
499 */
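/* The four bytes below form a packed vector-float (VF) immediate that should
 * decode to <0.0, 8.0, 16.0, 24.0>; the type-converting MOV into the uvec4
 * then yields the integer shift counts <0, 8, 16, 24> described above.
 */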
500 dst_reg shift(this, glsl_type::uvec4_type);
501 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
502
503 dst_reg shifted(this, glsl_type::uvec4_type);
504 src0.swizzle = BRW_SWIZZLE_XXXX;
505 emit(SHR(shifted, src0, src_reg(shift)));
506
507 shifted.type = BRW_REGISTER_TYPE_UB;
508 dst_reg f(this, glsl_type::vec4_type);
509 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
510
511 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
512 }
513
514 void
515 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
516 {
517 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
518 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
519 * is not suitable to generate the shift values, but we can use the packed
520 * vector float and a type-converting MOV.
521 */
522 dst_reg shift(this, glsl_type::uvec4_type);
523 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
524
525 dst_reg shifted(this, glsl_type::uvec4_type);
526 src0.swizzle = BRW_SWIZZLE_XXXX;
527 emit(SHR(shifted, src0, src_reg(shift)));
528
529 shifted.type = BRW_REGISTER_TYPE_B;
530 dst_reg f(this, glsl_type::vec4_type);
531 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
532
533 dst_reg scaled(this, glsl_type::vec4_type);
534 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
535
536 dst_reg max(this, glsl_type::vec4_type);
537 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
538 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
539 }
540
541 void
542 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
543 {
544 dst_reg saturated(this, glsl_type::vec4_type);
545 vec4_instruction *inst = emit(MOV(saturated, src0));
546 inst->saturate = true;
547
548 dst_reg scaled(this, glsl_type::vec4_type);
549 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
550
551 dst_reg rounded(this, glsl_type::vec4_type);
552 emit(RNDE(rounded, src_reg(scaled)));
553
554 dst_reg u(this, glsl_type::uvec4_type);
555 emit(MOV(u, src_reg(rounded)));
556
557 src_reg bytes(u);
558 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
559 }
560
561 void
562 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
563 {
564 dst_reg max(this, glsl_type::vec4_type);
565 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
566
567 dst_reg min(this, glsl_type::vec4_type);
568 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
569
570 dst_reg scaled(this, glsl_type::vec4_type);
571 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
572
573 dst_reg rounded(this, glsl_type::vec4_type);
574 emit(RNDE(rounded, src_reg(scaled)));
575
576 dst_reg i(this, glsl_type::ivec4_type);
577 emit(MOV(i, src_reg(rounded)));
578
579 src_reg bytes(i);
580 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
581 }
582
583 /**
584 * Returns the minimum number of vec4 elements needed to pack a type.
585 *
586 * For simple types, it will return 1 (a single vec4); for matrices, the
587 * number of columns; for array and struct, the sum of the vec4_size of
588 * each of its elements; and for sampler and atomic, zero.
589 *
590 * This method is useful to calculate how much register space is needed to
591 * store a particular type.
592 */
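/* For example, under these rules a float, a vec2 and a vec4 each take one
 * vec4 slot, a mat3 takes three (one per column), and float[4] takes four
 * (one per element).
 */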
593 extern "C" int
594 type_size_vec4(const struct glsl_type *type)
595 {
596 unsigned int i;
597 int size;
598
599 switch (type->base_type) {
600 case GLSL_TYPE_UINT:
601 case GLSL_TYPE_INT:
602 case GLSL_TYPE_FLOAT:
603 case GLSL_TYPE_BOOL:
604 if (type->is_matrix()) {
605 return type->matrix_columns;
606 } else {
607 /* Regardless of size of vector, it gets a vec4. This is bad
608 * packing for things like floats, but otherwise arrays become a
609 * mess. Hopefully a later pass over the code can pack scalars
610 * down if appropriate.
611 */
612 return 1;
613 }
614 case GLSL_TYPE_ARRAY:
615 assert(type->length > 0);
616 return type_size_vec4(type->fields.array) * type->length;
617 case GLSL_TYPE_STRUCT:
618 size = 0;
619 for (i = 0; i < type->length; i++) {
620 size += type_size_vec4(type->fields.structure[i].type);
621 }
622 return size;
623 case GLSL_TYPE_SUBROUTINE:
624 return 1;
625
626 case GLSL_TYPE_SAMPLER:
627 /* Samplers take up no register space, since they're baked in at
628 * link time.
629 */
630 return 0;
631 case GLSL_TYPE_ATOMIC_UINT:
632 return 0;
633 case GLSL_TYPE_IMAGE:
634 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
635 case GLSL_TYPE_VOID:
636 case GLSL_TYPE_DOUBLE:
637 case GLSL_TYPE_ERROR:
638 case GLSL_TYPE_INTERFACE:
639 case GLSL_TYPE_FUNCTION:
640 unreachable("not reached");
641 }
642
643 return 0;
644 }
645
646 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
647 {
648 init();
649
650 this->file = GRF;
651 this->reg = v->alloc.allocate(type_size_vec4(type));
652
653 if (type->is_array() || type->is_record()) {
654 this->swizzle = BRW_SWIZZLE_NOOP;
655 } else {
656 this->swizzle = brw_swizzle_for_size(type->vector_elements);
657 }
658
659 this->type = brw_type_for_base_type(type);
660 }
661
662 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
663 {
664 assert(size > 0);
665
666 init();
667
668 this->file = GRF;
669 this->reg = v->alloc.allocate(type_size_vec4(type) * size);
670
671 this->swizzle = BRW_SWIZZLE_NOOP;
672
673 this->type = brw_type_for_base_type(type);
674 }
675
676 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
677 {
678 init();
679
680 this->file = GRF;
681 this->reg = v->alloc.allocate(type_size_vec4(type));
682
683 if (type->is_array() || type->is_record()) {
684 this->writemask = WRITEMASK_XYZW;
685 } else {
686 this->writemask = (1 << type->vector_elements) - 1;
687 }
688
689 this->type = brw_type_for_base_type(type);
690 }
691
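/* Emit a MIN/MAX operation: a single SEL with a conditional mod on gen6+,
 * or a CMP followed by a predicated SEL on earlier generations.
 */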
692 vec4_instruction *
693 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
694 src_reg src0, src_reg src1)
695 {
696 vec4_instruction *inst;
697
698 if (devinfo->gen >= 6) {
699 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
700 inst->conditional_mod = conditionalmod;
701 } else {
702 emit(CMP(dst, src0, src1, conditionalmod));
703
704 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
705 inst->predicate = BRW_PREDICATE_NORMAL;
706 }
707
708 return inst;
709 }
710
711 vec4_instruction *
712 vec4_visitor::emit_lrp(const dst_reg &dst,
713 const src_reg &x, const src_reg &y, const src_reg &a)
714 {
715 if (devinfo->gen >= 6) {
716 /* Note that the instruction's argument order is reversed from GLSL
717 * and the IR.
718 */
719 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
720 fix_3src_operand(x)));
721 } else {
722 /* Earlier generations don't support three source operations, so we
723 * need to emit x*(1-a) + y*a.
724 */
725 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
726 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
727 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
728 y_times_a.writemask = dst.writemask;
729 one_minus_a.writemask = dst.writemask;
730 x_times_one_minus_a.writemask = dst.writemask;
731
732 emit(MUL(y_times_a, y, a));
733 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
734 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
735 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
736 }
737 }
738
739 /**
740 * Emits the instructions needed to perform a pull constant load. before_block
741 * and before_inst can be NULL, in which case the instruction will be appended
742 * to the end of the instruction list.
743 */
744 void
745 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
746 src_reg surf_index,
747 src_reg offset_reg,
748 bblock_t *before_block,
749 vec4_instruction *before_inst)
750 {
751 assert((before_inst == NULL && before_block == NULL) ||
752 (before_inst && before_block));
753
754 vec4_instruction *pull;
755
756 if (devinfo->gen >= 9) {
757 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
758 src_reg header(this, glsl_type::uvec4_type, 2);
759
760 pull = new(mem_ctx)
761 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
762 dst_reg(header));
763
764 if (before_inst)
765 emit_before(before_block, before_inst, pull);
766 else
767 emit(pull);
768
769 dst_reg index_reg = retype(offset(dst_reg(header), 1),
770 offset_reg.type);
771 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
772
773 if (before_inst)
774 emit_before(before_block, before_inst, pull);
775 else
776 emit(pull);
777
778 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
779 dst,
780 surf_index,
781 header);
782 pull->mlen = 2;
783 pull->header_size = 1;
784 } else if (devinfo->gen >= 7) {
785 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
786
787 grf_offset.type = offset_reg.type;
788
789 pull = MOV(grf_offset, offset_reg);
790
791 if (before_inst)
792 emit_before(before_block, before_inst, pull);
793 else
794 emit(pull);
795
796 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
797 dst,
798 surf_index,
799 src_reg(grf_offset));
800 pull->mlen = 1;
801 } else {
802 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
803 dst,
804 surf_index,
805 offset_reg);
806 pull->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
807 pull->mlen = 1;
808 }
809
810 if (before_inst)
811 emit_before(before_block, before_inst, pull);
812 else
813 emit(pull);
814 }
815
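/* Copy the value of \p src from its first live channel into every channel of
 * the returned register (via FIND_LIVE_CHANNEL + BROADCAST), producing a
 * value that is uniform across the whole group.
 */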
816 src_reg
817 vec4_visitor::emit_uniformize(const src_reg &src)
818 {
819 const src_reg chan_index(this, glsl_type::uint_type);
820 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
821 src.type);
822
823 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
824 ->force_writemask_all = true;
825 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
826 ->force_writemask_all = true;
827
828 return src_reg(dst);
829 }
830
831 src_reg
832 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
833 src_reg coordinate, src_reg sampler)
834 {
835 vec4_instruction *inst =
836 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
837 dst_reg(this, glsl_type::uvec4_type));
838 inst->base_mrf = 2;
839 inst->src[1] = sampler;
840
841 int param_base;
842
843 if (devinfo->gen >= 9) {
844 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
845 vec4_instruction *header_inst = new(mem_ctx)
846 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
847 dst_reg(MRF, inst->base_mrf));
848
849 emit(header_inst);
850
851 inst->mlen = 2;
852 inst->header_size = 1;
853 param_base = inst->base_mrf + 1;
854 } else {
855 inst->mlen = 1;
856 param_base = inst->base_mrf;
857 }
858
859 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
860 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
861 int zero_mask = 0xf & ~coord_mask;
862
863 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
864 coordinate));
865
866 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
867 src_reg(0)));
868
869 emit(inst);
870 return src_reg(inst->dst);
871 }
872
873 bool
874 vec4_visitor::is_high_sampler(src_reg sampler)
875 {
876 if (devinfo->gen < 8 && !devinfo->is_haswell)
877 return false;
878
879 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
880 }
881
882 void
883 vec4_visitor::emit_texture(ir_texture_opcode op,
884 dst_reg dest,
885 const glsl_type *dest_type,
886 src_reg coordinate,
887 int coord_components,
888 src_reg shadow_comparitor,
889 src_reg lod, src_reg lod2,
890 src_reg sample_index,
891 uint32_t constant_offset,
892 src_reg offset_value,
893 src_reg mcs,
894 bool is_cube_array,
895 uint32_t sampler,
896 src_reg sampler_reg)
897 {
898 enum opcode opcode;
899 switch (op) {
900 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
901 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
902 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
903 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
904 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
905 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
906 case ir_tg4: opcode = offset_value.file != BAD_FILE
907 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
908 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
909 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
910 case ir_txb:
911 unreachable("TXB is not valid for vertex shaders.");
912 case ir_lod:
913 unreachable("LOD is not valid for vertex shaders.");
914 default:
915 unreachable("Unrecognized tex op");
916 }
917
918 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
919 opcode, dst_reg(this, dest_type));
920
921 inst->offset = constant_offset;
922
923 /* The message header is necessary for:
924 * - Gen4 (always)
925 * - Gen9+ for selecting SIMD4x2
926 * - Texel offsets
927 * - Gather channel selection
928 * - Sampler indices too large to fit in a 4-bit value.
929 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
930 */
931 inst->header_size =
932 (devinfo->gen < 5 || devinfo->gen >= 9 ||
933 inst->offset != 0 || op == ir_tg4 ||
934 op == ir_texture_samples ||
935 is_high_sampler(sampler_reg)) ? 1 : 0;
936 inst->base_mrf = 2;
937 inst->mlen = inst->header_size;
938 inst->dst.writemask = WRITEMASK_XYZW;
939 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
940
941 inst->src[1] = sampler_reg;
942
943 /* MRF for the first parameter */
944 int param_base = inst->base_mrf + inst->header_size;
945
946 if (op == ir_txs || op == ir_query_levels) {
947 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
948 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
949 inst->mlen++;
950 } else if (op == ir_texture_samples) {
951 inst->dst.writemask = WRITEMASK_X;
952 } else {
953 /* Load the coordinate */
954 /* FINISHME: gl_clamp_mask and saturate */
955 int coord_mask = (1 << coord_components) - 1;
956 int zero_mask = 0xf & ~coord_mask;
957
958 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
959 coordinate));
960 inst->mlen++;
961
962 if (zero_mask != 0) {
963 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
964 src_reg(0)));
965 }
966 /* Load the shadow comparitor */
967 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
968 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
969 WRITEMASK_X),
970 shadow_comparitor));
971 inst->mlen++;
972 }
973
974 /* Load the LOD info */
975 if (op == ir_tex || op == ir_txl) {
976 int mrf, writemask;
977 if (devinfo->gen >= 5) {
978 mrf = param_base + 1;
979 if (shadow_comparitor.file != BAD_FILE) {
980 writemask = WRITEMASK_Y;
981 /* mlen already incremented */
982 } else {
983 writemask = WRITEMASK_X;
984 inst->mlen++;
985 }
986 } else /* devinfo->gen == 4 */ {
987 mrf = param_base;
988 writemask = WRITEMASK_W;
989 }
990 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
991 } else if (op == ir_txf) {
992 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
993 } else if (op == ir_txf_ms) {
994 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
995 sample_index));
996 if (devinfo->gen >= 7) {
997 /* MCS data is in the first channel of `mcs`, but we need to get it into
998 * the .y channel of the second vec4 of params, so replicate .x across
999 * the whole vec4 and then mask off everything except .y
1000 */
1001 mcs.swizzle = BRW_SWIZZLE_XXXX;
1002 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1003 mcs));
1004 }
1005 inst->mlen++;
1006 } else if (op == ir_txd) {
1007 const brw_reg_type type = lod.type;
1008
1009 if (devinfo->gen >= 5) {
1010 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1011 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1012 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1013 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1014 inst->mlen++;
1015
1016 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1017 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1018 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1019 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1020 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1021 inst->mlen++;
1022
1023 if (shadow_comparitor.file != BAD_FILE) {
1024 emit(MOV(dst_reg(MRF, param_base + 2,
1025 shadow_comparitor.type, WRITEMASK_Z),
1026 shadow_comparitor));
1027 }
1028 }
1029 } else /* devinfo->gen == 4 */ {
1030 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1031 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1032 inst->mlen += 2;
1033 }
1034 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1035 if (shadow_comparitor.file != BAD_FILE) {
1036 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1037 shadow_comparitor));
1038 }
1039
1040 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1041 offset_value));
1042 inst->mlen++;
1043 }
1044 }
1045
1046 emit(inst);
1047
1048 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1049 * spec requires layers.
1050 */
1051 if (op == ir_txs && is_cube_array) {
1052 emit_math(SHADER_OPCODE_INT_QUOTIENT,
1053 writemask(inst->dst, WRITEMASK_Z),
1054 src_reg(inst->dst), src_reg(6));
1055 }
1056
1057 if (devinfo->gen == 6 && op == ir_tg4) {
1058 emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
1059 }
1060
1061 swizzle_result(op, dest,
1062 src_reg(inst->dst), sampler, dest_type);
1063 }
1064
1065 /**
1066 * Apply workarounds for Gen6 gather with UINT/SINT
1067 */
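/* \p wa is a bitmask taken from key_tex->gen6_gather_wa.  The gathered value
 * arrives as normalized data, so the code below rescales it by
 * (2^width - 1) and, when WA_SIGN is set, sign-extends the result by
 * shifting the sign bit up and back down.
 */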
1068 void
1069 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1070 {
1071 if (!wa)
1072 return;
1073
1074 int width = (wa & WA_8BIT) ? 8 : 16;
1075 dst_reg dst_f = dst;
1076 dst_f.type = BRW_REGISTER_TYPE_F;
1077
1078 /* Convert from UNORM to UINT */
1079 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
1080 emit(MOV(dst, src_reg(dst_f)));
1081
1082 if (wa & WA_SIGN) {
1083 /* Reinterpret the UINT value as a signed INT value by
1084 * shifting the sign bit into place, then shifting back
1085 * preserving sign.
1086 */
1087 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
1088 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
1089 }
1090 }
1091
1092 /**
1093 * Set up the gather channel based on the swizzle, for gather4.
1094 */
1095 uint32_t
1096 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
1097 {
1098 int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
1099 switch (swiz) {
1100 case SWIZZLE_X: return 0;
1101 case SWIZZLE_Y:
1102 /* gather4 sampler is broken for green channel on RG32F --
1103 * we must ask for blue instead.
1104 */
1105 if (key_tex->gather_channel_quirk_mask & (1 << sampler))
1106 return 2;
1107 return 1;
1108 case SWIZZLE_Z: return 2;
1109 case SWIZZLE_W: return 3;
1110 default:
1111 unreachable("Not reached"); /* zero, one swizzles handled already */
1112 }
1113 }
1114
1115 void
1116 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
1117 src_reg orig_val, uint32_t sampler,
1118 const glsl_type *dest_type)
1119 {
1120 int s = key_tex->swizzles[sampler];
1121
1122 dst_reg swizzled_result = dest;
1123
1124 if (op == ir_query_levels) {
1125 /* # levels is in .w */
1126 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1127 emit(MOV(swizzled_result, orig_val));
1128 return;
1129 }
1130
1131 if (op == ir_txs || dest_type == glsl_type::float_type
1132 || s == SWIZZLE_NOOP || op == ir_tg4) {
1133 emit(MOV(swizzled_result, orig_val));
1134 return;
1135 }
1136
1137
1138 int zero_mask = 0, one_mask = 0, copy_mask = 0;
1139 int swizzle[4] = {0};
1140
1141 for (int i = 0; i < 4; i++) {
1142 switch (GET_SWZ(s, i)) {
1143 case SWIZZLE_ZERO:
1144 zero_mask |= (1 << i);
1145 break;
1146 case SWIZZLE_ONE:
1147 one_mask |= (1 << i);
1148 break;
1149 default:
1150 copy_mask |= (1 << i);
1151 swizzle[i] = GET_SWZ(s, i);
1152 break;
1153 }
1154 }
1155
1156 if (copy_mask) {
1157 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1158 swizzled_result.writemask = copy_mask;
1159 emit(MOV(swizzled_result, orig_val));
1160 }
1161
1162 if (zero_mask) {
1163 swizzled_result.writemask = zero_mask;
1164 emit(MOV(swizzled_result, src_reg(0.0f)));
1165 }
1166
1167 if (one_mask) {
1168 swizzled_result.writemask = one_mask;
1169 emit(MOV(swizzled_result, src_reg(1.0f)));
1170 }
1171 }
1172
1173 void
1174 vec4_visitor::gs_emit_vertex(int stream_id)
1175 {
1176 unreachable("not reached");
1177 }
1178
1179 void
1180 vec4_visitor::gs_end_primitive()
1181 {
1182 unreachable("not reached");
1183 }
1184
1185 void
1186 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1187 dst_reg dst, src_reg offset,
1188 src_reg src0, src_reg src1)
1189 {
1190 unsigned mlen = 0;
1191
1192 /* Set the atomic operation offset. */
1193 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
1194 mlen++;
1195
1196 /* Set the atomic operation arguments. */
1197 if (src0.file != BAD_FILE) {
1198 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
1199 mlen++;
1200 }
1201
1202 if (src1.file != BAD_FILE) {
1203 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
1204 mlen++;
1205 }
1206
1207 /* Emit the instruction. Note that this maps to the normal SIMD8
1208 * untyped atomic message on Ivy Bridge, but that's OK because
1209 * unused channels will be masked out.
1210 */
1211 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1212 brw_message_reg(0),
1213 src_reg(surf_index), src_reg(atomic_op));
1214 inst->mlen = mlen;
1215 }
1216
1217 void
1218 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1219 src_reg offset)
1220 {
1221 /* Set the surface read offset. */
1222 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
1223
1224 /* Emit the instruction. Note that this maps to the normal SIMD8
1225 * untyped surface read message, but that's OK because unused
1226 * channels will be masked out.
1227 */
1228 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1229 brw_message_reg(0),
1230 src_reg(surf_index), src_reg(1));
1231 inst->mlen = 1;
1232 }
1233
1234 void
1235 vec4_visitor::emit_ndc_computation()
1236 {
1237 /* Get the position */
1238 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1239
1240 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1241 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1242 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1243
1244 current_annotation = "NDC";
1245 dst_reg ndc_w = ndc;
1246 ndc_w.writemask = WRITEMASK_W;
1247 src_reg pos_w = pos;
1248 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1249 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1250
1251 dst_reg ndc_xyz = ndc;
1252 ndc_xyz.writemask = WRITEMASK_XYZ;
1253
1254 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1255 }
1256
1257 void
1258 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1259 {
1260 if (devinfo->gen < 6 &&
1261 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1262 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1263 devinfo->has_negative_rhw_bug)) {
1264 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1265 dst_reg header1_w = header1;
1266 header1_w.writemask = WRITEMASK_W;
1267
1268 emit(MOV(header1, 0u));
1269
1270 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1271 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1272
1273 current_annotation = "Point size";
1274 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
1275 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
1276 }
1277
1278 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1279 current_annotation = "Clipping flags";
1280 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1281 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1282
1283 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
1284 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
1285 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1286
1287 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
1288 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
1289 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
1290 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1291 }
1292
1293 /* i965 clipping workaround:
1294 * 1) Test for -ve rhw
1295 * 2) If set,
1296 * set ndc = (0,0,0,0)
1297 * set ucp[6] = 1
1298 *
1299 * Later, clipping will detect ucp[6] and ensure the primitive is
1300 * clipped against all fixed planes.
1301 */
1302 if (devinfo->has_negative_rhw_bug) {
1303 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1304 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1305 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
1306 vec4_instruction *inst;
1307 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
1308 inst->predicate = BRW_PREDICATE_NORMAL;
1309 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1310 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
1311 inst->predicate = BRW_PREDICATE_NORMAL;
1312 }
1313
1314 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1315 } else if (devinfo->gen < 6) {
1316 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
1317 } else {
1318 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
1319 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1320 dst_reg reg_w = reg;
1321 reg_w.writemask = WRITEMASK_W;
1322 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1323 reg_as_src.type = reg_w.type;
1324 reg_as_src.swizzle = brw_swizzle_for_size(1);
1325 emit(MOV(reg_w, reg_as_src));
1326 }
1327 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1328 dst_reg reg_y = reg;
1329 reg_y.writemask = WRITEMASK_Y;
1330 reg_y.type = BRW_REGISTER_TYPE_D;
1331 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1332 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1333 }
1334 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1335 dst_reg reg_z = reg;
1336 reg_z.writemask = WRITEMASK_Z;
1337 reg_z.type = BRW_REGISTER_TYPE_D;
1338 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1339 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1340 }
1341 }
1342 }
1343
1344 vec4_instruction *
1345 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1346 {
1347 assert(varying < VARYING_SLOT_MAX);
1348 assert(output_reg[varying].type == reg.type);
1349 current_annotation = output_reg_annotation[varying];
1350 /* Copy the register, saturating if necessary */
1351 return emit(MOV(reg, src_reg(output_reg[varying])));
1352 }
1353
1354 void
1355 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1356 {
1357 reg.type = BRW_REGISTER_TYPE_F;
1358 output_reg[varying].type = reg.type;
1359
1360 switch (varying) {
1361 case VARYING_SLOT_PSIZ:
1362 {
1363 /* PSIZ is always in slot 0, and is coupled with other flags. */
1364 current_annotation = "indices, point width, clip flags";
1365 emit_psiz_and_flags(reg);
1366 break;
1367 }
1368 case BRW_VARYING_SLOT_NDC:
1369 current_annotation = "NDC";
1370 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1371 break;
1372 case VARYING_SLOT_POS:
1373 current_annotation = "gl_Position";
1374 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1375 break;
1376 case VARYING_SLOT_EDGE:
1377 /* This is present when doing unfilled polygons. We're supposed to copy
1378 * the edge flag from the user-provided vertex array
1379 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1380 * of that attribute (starts as 1.0f). This is then used in clipping to
1381 * determine which edges should be drawn as wireframe.
1382 */
1383 current_annotation = "edge flag";
1384 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1385 glsl_type::float_type, WRITEMASK_XYZW))));
1386 break;
1387 case BRW_VARYING_SLOT_PAD:
1388 /* No need to write to this slot */
1389 break;
1390 default:
1391 emit_generic_urb_slot(reg, varying);
1392 break;
1393 }
1394 }
1395
1396 static int
1397 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1398 {
1399 if (devinfo->gen >= 6) {
1400 /* URB data written (does not include the message header reg) must
1401 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1402 * section 5.4.3.2.2: URB_INTERLEAVED.
1403 *
1404 * URB entries are allocated on a multiple of 1024 bits, so an
1405 * extra 128 bits written here to make the end align to 256 is
1406 * no problem.
1407 */
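/* mlen includes the message header register, so the data portion
 * (mlen - 1) is a multiple of two exactly when mlen is odd; round even
 * values up by one.
 */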
1408 if ((mlen % 2) != 1)
1409 mlen++;
1410 }
1411
1412 return mlen;
1413 }
1414
1415
1416 /**
1417 * Generates the VUE payload plus the necessary URB write instructions to
1418 * output it.
1419 *
1420 * The VUE layout is documented in Volume 2a.
1421 */
1422 void
1423 vec4_visitor::emit_vertex()
1424 {
1425 /* MRF 0 is reserved for the debugger, so start with message header
1426 * in MRF 1.
1427 */
1428 int base_mrf = 1;
1429 int mrf = base_mrf;
1430 /* In the process of generating our URB write message contents, we
1431 * may need to unspill a register or load from an array. Those reads
1432 * would use MRFs 14-15 on gen4-5 (the MRFs above FIRST_SPILL_MRF() on gen6).
1433 */
1434 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1435
1436 /* The following assertion verifies that max_usable_mrf causes an
1437 * even-numbered amount of URB write data, which will meet gen6's
1438 * requirements for length alignment.
1439 */
1440 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1441
1442 /* First mrf is the g0-based message header containing URB handles and
1443 * such.
1444 */
1445 emit_urb_write_header(mrf++);
1446
1447 if (devinfo->gen < 6) {
1448 emit_ndc_computation();
1449 }
1450
1451 /* We may need to split this up into several URB writes, so do them in a
1452 * loop.
1453 */
1454 int slot = 0;
1455 bool complete = false;
1456 do {
1457 /* URB offset is in URB row increments, and each of our MRFs is half of
1458 * one of those, since we're doing interleaved writes.
1459 */
1460 int offset = slot / 2;
1461
1462 mrf = base_mrf + 1;
1463 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1464 emit_urb_slot(dst_reg(MRF, mrf++),
1465 prog_data->vue_map.slot_to_varying[slot]);
1466
1467 /* If this was max_usable_mrf, we can't fit anything more into this
1468 * URB WRITE. Same thing if we reached the maximum length available.
1469 */
1470 if (mrf > max_usable_mrf ||
1471 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1472 slot++;
1473 break;
1474 }
1475 }
1476
1477 complete = slot >= prog_data->vue_map.num_slots;
1478 current_annotation = "URB write";
1479 vec4_instruction *inst = emit_urb_write_opcode(complete);
1480 inst->base_mrf = base_mrf;
1481 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1482 inst->offset += offset;
1483 } while(!complete);
1484 }
1485
1486
1487 src_reg
1488 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1489 src_reg *reladdr, int reg_offset)
1490 {
1491 /* Because we store the values to scratch interleaved like our
1492 * vertex data, we need to scale the vec4 index by 2.
1493 */
1494 int message_header_scale = 2;
1495
1496 /* Pre-gen6, the message header uses byte offsets instead of vec4
1497 * (16-byte) offset units.
1498 */
1499 if (devinfo->gen < 6)
1500 message_header_scale *= 16;
1501
1502 if (reladdr) {
1503 src_reg index = src_reg(this, glsl_type::int_type);
1504
1505 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1506 src_reg(reg_offset)));
1507 emit_before(block, inst, MUL(dst_reg(index), index,
1508 src_reg(message_header_scale)));
1509
1510 return index;
1511 } else {
1512 return src_reg(reg_offset * message_header_scale);
1513 }
1514 }
1515
1516 src_reg
1517 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
1518 src_reg *reladdr, int reg_offset)
1519 {
1520 if (reladdr) {
1521 src_reg index = src_reg(this, glsl_type::int_type);
1522
1523 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1524 src_reg(reg_offset)));
1525
1526 /* Pre-gen6, the message header uses byte offsets instead of vec4
1527 * (16-byte) offset units.
1528 */
1529 if (devinfo->gen < 6) {
1530 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
1531 }
1532
1533 return index;
1534 } else if (devinfo->gen >= 8) {
1535 /* Store the offset in a GRF so we can send-from-GRF. */
1536 src_reg offset = src_reg(this, glsl_type::int_type);
1537 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
1538 return offset;
1539 } else {
1540 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
1541 return src_reg(reg_offset * message_header_scale);
1542 }
1543 }
1544
1545 /**
1546 * Emits an instruction before @inst to load the value named by @orig_src
1547 * from scratch space at @base_offset to @temp.
1548 *
1549 * @base_offset is measured in 32-byte units (the size of a register).
1550 */
1551 void
1552 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1553 dst_reg temp, src_reg orig_src,
1554 int base_offset)
1555 {
1556 int reg_offset = base_offset + orig_src.reg_offset;
1557 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1558 reg_offset);
1559
1560 emit_before(block, inst, SCRATCH_READ(temp, index));
1561 }
1562
1563 /**
1564 * Emits an instruction after @inst to store the value to be written
1565 * to @orig_dst to scratch space at @base_offset, from @temp.
1566 *
1567 * @base_offset is measured in 32-byte units (the size of a register).
1568 */
1569 void
1570 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1571 int base_offset)
1572 {
1573 int reg_offset = base_offset + inst->dst.reg_offset;
1574 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1575 reg_offset);
1576
1577 /* Create a temporary register to store *inst's result in.
1578 *
1579 * We have to be careful in MOVing from our temporary result register in
1580 * the scratch write. If we swizzle from channels of the temporary that
1581 * weren't initialized, it will confuse live interval analysis, which will
1582 * make spilling fail to make progress.
1583 */
1584 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1585 inst->dst.type),
1586 brw_swizzle_for_mask(inst->dst.writemask));
1587 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1588 inst->dst.writemask));
1589 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1590 if (inst->opcode != BRW_OPCODE_SEL)
1591 write->predicate = inst->predicate;
1592 write->ir = inst->ir;
1593 write->annotation = inst->annotation;
1594 inst->insert_after(block, write);
1595
1596 inst->dst.file = temp.file;
1597 inst->dst.reg = temp.reg;
1598 inst->dst.reg_offset = temp.reg_offset;
1599 inst->dst.reladdr = NULL;
1600 }
1601
1602 /**
1603 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1604 * adds the scratch read(s) before \p inst. The function also checks for
1605 * recursive reladdr scratch accesses, issuing the corresponding scratch
1606 * loads and rewriting reladdr references accordingly.
1607 *
1608 * \return \p src if it did not require a scratch load, otherwise, the
1609 * register holding the result of the scratch load that the caller should
1610 * use to rewrite src.
1611 */
1612 src_reg
1613 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1614 vec4_instruction *inst, src_reg src)
1615 {
1616 /* Resolve recursive reladdr scratch access by calling ourselves
1617 * with src.reladdr
1618 */
1619 if (src.reladdr)
1620 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1621 *src.reladdr);
1622
1623 /* Now handle scratch access on src */
1624 if (src.file == GRF && scratch_loc[src.reg] != -1) {
1625 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1626 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
1627 src.reg = temp.reg;
1628 src.reg_offset = temp.reg_offset;
1629 src.reladdr = NULL;
1630 }
1631
1632 return src;
1633 }
1634
1635 /**
1636 * We can't generally support array access in GRF space, because a
1637 * single instruction's destination can only span 2 contiguous
1638 * registers. So, we send all GRF arrays that get variable index
1639 * access to scratch space.
1640 */
1641 void
1642 vec4_visitor::move_grf_array_access_to_scratch()
1643 {
1644 int scratch_loc[this->alloc.count];
1645 memset(scratch_loc, -1, sizeof(scratch_loc));
1646
1647 /* First, calculate the set of virtual GRFs that need to be punted
1648 * to scratch due to having any array access on them, and where in
1649 * scratch.
1650 */
1651 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1652 if (inst->dst.file == GRF && inst->dst.reladdr) {
1653 if (scratch_loc[inst->dst.reg] == -1) {
1654 scratch_loc[inst->dst.reg] = last_scratch;
1655 last_scratch += this->alloc.sizes[inst->dst.reg];
1656 }
1657
1658 for (src_reg *iter = inst->dst.reladdr;
1659 iter->reladdr;
1660 iter = iter->reladdr) {
1661 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1662 scratch_loc[iter->reg] = last_scratch;
1663 last_scratch += this->alloc.sizes[iter->reg];
1664 }
1665 }
1666 }
1667
1668 for (int i = 0 ; i < 3; i++) {
1669 for (src_reg *iter = &inst->src[i];
1670 iter->reladdr;
1671 iter = iter->reladdr) {
1672 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1673 scratch_loc[iter->reg] = last_scratch;
1674 last_scratch += this->alloc.sizes[iter->reg];
1675 }
1676 }
1677 }
1678 }
1679
1680 /* Now, for anything that will be accessed through scratch, rewrite
1681 * it to load/store. Note that this is a _safe list walk, because
1682 * we may generate a new scratch_write instruction after the one
1683 * we're processing.
1684 */
1685 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1686 /* Set up the annotation tracking for new generated instructions. */
1687 base_ir = inst->ir;
1688 current_annotation = inst->annotation;
1689
1690 /* First handle scratch access on the dst. Notice we have to handle
1691 * the case where the dst's reladdr also points to scratch space.
1692 */
1693 if (inst->dst.reladdr)
1694 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1695 *inst->dst.reladdr);
1696
1697 /* Now that we have handled any (possibly recursive) reladdr scratch
1698 * accesses for dst we can safely do the scratch write for dst itself
1699 */
1700 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
1701 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
1702
1703 /* Now handle scratch access on any src. In this case, since inst->src[i]
1704 * already is a src_reg, we can just call emit_resolve_reladdr with
1705 * inst->src[i] and it will take care of handling scratch loads for
1706 * both src and src.reladdr (recursively).
1707 */
1708 for (int i = 0 ; i < 3; i++) {
1709 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1710 inst->src[i]);
1711 }
1712 }
1713 }
1714
1715 /**
1716 * Emits an instruction before @inst to load the value named by @orig_src
1717 * from the pull constant buffer (surface) at @base_offset to @temp.
1718 */
1719 void
1720 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1721 dst_reg temp, src_reg orig_src,
1722 int base_offset)
1723 {
1724 int reg_offset = base_offset + orig_src.reg_offset;
1725 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
1726 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
1727 reg_offset);
1728
1729 emit_pull_constant_load_reg(temp,
1730 index,
1731 offset,
1732 block, inst);
1733 }
1734
1735 /**
1736 * Implements array access of uniforms by inserting a
1737 * PULL_CONSTANT_LOAD instruction.
1738 *
1739 * Unlike temporary GRF array access (where we don't support it due to
1740 * the difficulty of doing relative addressing on instruction
1741 * destinations), we could potentially do array access of uniforms
1742 * that were loaded in GRF space as push constants. In real-world
1743 * usage we've seen, though, the arrays being used are always larger
1744 * than we could load as push constants, so just always move all
1745 * uniform array access out to a pull constant buffer.
1746 */
1747 void
1748 vec4_visitor::move_uniform_array_access_to_pull_constants()
1749 {
1750 int pull_constant_loc[this->uniforms];
1751 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1752 bool nested_reladdr;
1753
1754 /* Walk through and find array access of uniforms. Put a copy of that
1755 * uniform in the pull constant buffer.
1756 *
1757 * Note that we don't move constant-indexed accesses to arrays. No
1758 * testing has been done of the performance impact of this choice.
1759 */
1760 do {
1761 nested_reladdr = false;
1762
1763 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1764 for (int i = 0 ; i < 3; i++) {
1765 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1766 continue;
1767
1768 int uniform = inst->src[i].reg;
1769
1770 if (inst->src[i].reladdr->reladdr)
1771 nested_reladdr = true; /* will need another pass */
1772
1773 /* If this array isn't already present in the pull constant buffer,
1774 * add it.
1775 */
1776 if (pull_constant_loc[uniform] == -1) {
1777 const gl_constant_value **values =
1778 &stage_prog_data->param[uniform * 4];
1779
1780 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
1781
1782 assert(uniform < uniform_array_size);
1783 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
1784 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1785 = values[j];
1786 }
1787 }
1788
1789 /* Set up the annotation tracking for new generated instructions. */
1790 base_ir = inst->ir;
1791 current_annotation = inst->annotation;
1792
1793 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1794
1795 emit_pull_constant_load(block, inst, temp, inst->src[i],
1796 pull_constant_loc[uniform]);
1797
1798 inst->src[i].file = temp.file;
1799 inst->src[i].reg = temp.reg;
1800 inst->src[i].reg_offset = temp.reg_offset;
1801 inst->src[i].reladdr = NULL;
1802 }
1803 }
1804 } while (nested_reladdr);
1805
1806 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1807 * no need to track them as larger-than-vec4 objects. This will be
1808 * relied on in cutting out unused uniform vectors from push
1809 * constants.
1810 */
1811 split_uniform_registers();
1812 }
1813
1814 void
1815 vec4_visitor::resolve_ud_negate(src_reg *reg)
1816 {
1817 if (reg->type != BRW_REGISTER_TYPE_UD ||
1818 !reg->negate)
1819 return;
1820
1821 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1822 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1823 *reg = temp;
1824 }
1825
1826 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1827 void *log_data,
1828 const struct brw_sampler_prog_key_data *key_tex,
1829 struct brw_vue_prog_data *prog_data,
1830 nir_shader *shader,
1831 void *mem_ctx,
1832 bool no_spills,
1833 int shader_time_index)
1834 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1835 key_tex(key_tex),
1836 prog_data(prog_data),
1837 fail_msg(NULL),
1838 first_non_payload_grf(0),
1839 need_all_constants_in_pull_buffer(false),
1840 no_spills(no_spills),
1841 shader_time_index(shader_time_index),
1842 last_scratch(0)
1843 {
1844 this->failed = false;
1845
1846 this->base_ir = NULL;
1847 this->current_annotation = NULL;
1848 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1849
1850 this->virtual_grf_start = NULL;
1851 this->virtual_grf_end = NULL;
1852 this->live_intervals = NULL;
1853
1854 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1855
1856 this->uniforms = 0;
1857
1858 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
1859 * at least one. See setup_uniforms() in brw_vec4.cpp.
1860 */
1861 this->uniform_array_size = 1;
1862 if (prog_data) {
1863 this->uniform_array_size =
1864 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
1865 }
1866
1867 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
1868 }
1869
1870 vec4_visitor::~vec4_visitor()
1871 {
1872 }
1873
1874
1875 void
1876 vec4_visitor::fail(const char *format, ...)
1877 {
1878 va_list va;
1879 char *msg;
1880
1881 if (failed)
1882 return;
1883
1884 failed = true;
1885
1886 va_start(va, format);
1887 msg = ralloc_vasprintf(mem_ctx, format, va);
1888 va_end(va);
1889 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1890
1891 this->fail_msg = msg;
1892
1893 if (debug_enabled) {
1894 fprintf(stderr, "%s", msg);
1895 }
1896 }
1897
1898 } /* namespace brw */