i965/vec4/skl+: Use ld2dms_w instead of ld2dms
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 #include "program/sampler.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240 if (dst.file == HW_REG)
241 dst.fixed_hw_reg.type = dst.type;
242
243 resolve_ud_negate(&src0);
244 resolve_ud_negate(&src1);
245
246 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
247 inst->conditional_mod = condition;
248
249 return inst;
250 }
251
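/**
 * Build (but do not emit) a scratch-space read of a single vec4; the
 * message payload is assembled in the spill MRFs.
 */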
252 vec4_instruction *
253 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
258 dst, index);
259 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
260 inst->mlen = 2;
261
262 return inst;
263 }
264
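/**
 * Build (but do not emit) a scratch-space write of \p src to the location
 * named by \p index, again using the spill MRFs for the payload.
 */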
265 vec4_instruction *
266 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
267 const src_reg &index)
268 {
269 vec4_instruction *inst;
270
271 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
272 dst, src, index);
273 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
274 inst->mlen = 3;
275
276 return inst;
277 }
278
279 src_reg
280 vec4_visitor::fix_3src_operand(const src_reg &src)
281 {
282 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
283 * able to use vertical stride of zero to replicate the vec4 uniform, like
284 *
285 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
286 *
287 * But you can't, since vertical stride is always four in three-source
288 * instructions. Instead, insert a MOV instruction to do the replication so
289 * that the three-source instruction can consume it.
290 */
291
292 /* The MOV is only needed if the source is a uniform or immediate. */
293 if (src.file != UNIFORM && src.file != IMM)
294 return src;
295
296 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
297 return src;
298
299 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
300 expanded.type = src.type;
301 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
302 return src_reg(expanded);
303 }
304
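/**
 * If \p src carries abs or negate modifiers, copy it through a temporary
 * with a plain MOV so the caller gets a modifier-free source.
 */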
305 src_reg
306 vec4_visitor::resolve_source_modifiers(const src_reg &src)
307 {
308 if (!src.abs && !src.negate)
309 return src;
310
311 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
312 resolved.type = src.type;
313 emit(MOV(resolved, src));
314
315 return src_reg(resolved);
316 }
317
318 src_reg
319 vec4_visitor::fix_math_operand(const src_reg &src)
320 {
321 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
322 return src;
323
324 /* The gen6 math instruction ignores the source modifiers --
325 * swizzle, abs, negate, and at least some parts of the register
326 * region description.
327 *
328 * Rather than trying to enumerate all these cases, *always* expand the
329 * operand to a temp GRF for gen6.
330 *
331 * For gen7, keep the operand as-is, except if immediate, which gen7 still
332 * can't use.
333 */
334
335 if (devinfo->gen == 7 && src.file != IMM)
336 return src;
337
338 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
339 expanded.type = src.type;
340 emit(MOV(expanded, src));
341 return src_reg(expanded);
342 }
343
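/**
 * Emit a math instruction while honoring the per-generation restrictions:
 * operands are legalized with fix_math_operand(), Gen6 math with a partial
 * writemask is routed through a temporary (math must be align1 there), and
 * pre-Gen6 math is sent as an MRF-based message.
 */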
344 vec4_instruction *
345 vec4_visitor::emit_math(enum opcode opcode,
346 const dst_reg &dst,
347 const src_reg &src0, const src_reg &src1)
348 {
349 vec4_instruction *math =
350 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
351
352 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
353 /* MATH on Gen6 must be align1, so we can't do writemasks. */
354 math->dst = dst_reg(this, glsl_type::vec4_type);
355 math->dst.type = dst.type;
356 math = emit(MOV(dst, src_reg(math->dst)));
357 } else if (devinfo->gen < 6) {
358 math->base_mrf = 1;
359 math->mlen = src1.file == BAD_FILE ? 1 : 2;
360 }
361
362 return math;
363 }
364
365 void
366 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
367 {
368 if (devinfo->gen < 7) {
369 unreachable("ir_unop_pack_half_2x16 should be lowered");
370 }
371
372 assert(dst.type == BRW_REGISTER_TYPE_UD);
373 assert(src0.type == BRW_REGISTER_TYPE_F);
374
375 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
376 *
377 * Because this instruction does not have a 16-bit floating-point type,
378 * the destination data type must be Word (W).
379 *
380 * The destination must be DWord-aligned and specify a horizontal stride
381 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
382 * each destination channel and the upper word is not modified.
383 *
384 * The above restriction implies that the f32to16 instruction must use
385 * align1 mode, because only in align1 mode is it possible to specify
386 * horizontal stride. We choose here to defy the hardware docs and emit
387 * align16 instructions.
388 *
389 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
390 * instructions. I was partially successful in that the code passed all
391 * tests. However, the code was dubiously correct and fragile, and the
392 * tests were not harsh enough to probe that frailty. Not trusting the
393 * code, I chose instead to remain in align16 mode in defiance of the hw
394 * docs).
395 *
396 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
397 * simulator, emitting a f32to16 in align16 mode with UD as destination
398 * data type is safe. The behavior differs from that specified in the PRM
399 * in that the upper word of each destination channel is cleared to 0.
400 */
401
402 dst_reg tmp_dst(this, glsl_type::uvec2_type);
403 src_reg tmp_src(tmp_dst);
404
405 #if 0
406 /* Verify the undocumented behavior on which the following instructions
407 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
408 * then the result of the bit-or instruction below will be incorrect.
409 *
410 * You should inspect the disasm output in order to verify that the MOV is
411 * not optimized away.
412 */
413 emit(MOV(tmp_dst, src_reg(0x12345678u)));
414 #endif
415
416 /* Give tmp the form below, where "." means untouched.
417 *
418 * w z y x w z y x
419 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
420 *
421 * The upper word of each write-channel must be 0 for the following
422 * bit-shift and bit-or instructions to work. Note that this
423 * relies on the undocumented hardware behavior mentioned above.
424 */
425 tmp_dst.writemask = WRITEMASK_XY;
426 emit(F32TO16(tmp_dst, src0));
427
428 /* Give the write-channels of dst the form:
429 * 0xhhhh0000
430 */
431 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
432 emit(SHL(dst, tmp_src, src_reg(16u)));
433
434 /* Finally, give the write-channels of dst the form of packHalf2x16's
435 * output:
436 * 0xhhhhllll
437 */
438 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
439 emit(OR(dst, src_reg(dst), tmp_src));
440 }
441
442 void
443 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
444 {
445 if (devinfo->gen < 7) {
446 unreachable("ir_unop_unpack_half_2x16 should be lowered");
447 }
448
449 assert(dst.type == BRW_REGISTER_TYPE_F);
450 assert(src0.type == BRW_REGISTER_TYPE_UD);
451
452 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
453 *
454 * Because this instruction does not have a 16-bit floating-point type,
455 * the source data type must be Word (W). The destination type must be
456 * F (Float).
457 *
458 * To use W as the source data type, we must adjust horizontal strides,
459 * which is only possible in align1 mode. All my [chadv] attempts at
460 * emitting align1 instructions for unpackHalf2x16 failed to pass the
461 * Piglit tests, so I gave up.
462 *
463 * I've verified that, on gen7 hardware and the simulator, it is safe to
464 * emit f16to32 in align16 mode with UD as source data type.
465 */
466
467 dst_reg tmp_dst(this, glsl_type::uvec2_type);
468 src_reg tmp_src(tmp_dst);
469
470 tmp_dst.writemask = WRITEMASK_X;
471 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
472
473 tmp_dst.writemask = WRITEMASK_Y;
474 emit(SHR(tmp_dst, src0, src_reg(16u)));
475
476 dst.writemask = WRITEMASK_XY;
477 emit(F16TO32(dst, tmp_src));
478 }
479
480 void
481 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
482 {
483 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
484 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
485 * is not suitable to generate the shift values, but we can use the packed
486 * vector float and a type-converting MOV.
487 */
488 dst_reg shift(this, glsl_type::uvec4_type);
489 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
490
491 dst_reg shifted(this, glsl_type::uvec4_type);
492 src0.swizzle = BRW_SWIZZLE_XXXX;
493 emit(SHR(shifted, src0, src_reg(shift)));
494
495 shifted.type = BRW_REGISTER_TYPE_UB;
496 dst_reg f(this, glsl_type::vec4_type);
497 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
498
499 emit(MUL(dst, src_reg(f), src_reg(1.0f / 255.0f)));
500 }
501
502 void
503 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
504 {
505 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
506 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
507 * is not suitable to generate the shift values, but we can use the packed
508 * vector float and a type-converting MOV.
509 */
510 dst_reg shift(this, glsl_type::uvec4_type);
511 emit(MOV(shift, src_reg(0x00, 0x60, 0x70, 0x78)));
512
513 dst_reg shifted(this, glsl_type::uvec4_type);
514 src0.swizzle = BRW_SWIZZLE_XXXX;
515 emit(SHR(shifted, src0, src_reg(shift)));
516
517 shifted.type = BRW_REGISTER_TYPE_B;
518 dst_reg f(this, glsl_type::vec4_type);
519 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
520
521 dst_reg scaled(this, glsl_type::vec4_type);
522 emit(MUL(scaled, src_reg(f), src_reg(1.0f / 127.0f)));
523
524 dst_reg max(this, glsl_type::vec4_type);
525 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), src_reg(-1.0f));
526 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), src_reg(1.0f));
527 }
528
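/**
 * Implement packUnorm4x8(): saturate, scale by 255, round to even, convert
 * to unsigned and pack the low byte of each channel into the destination.
 */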
529 void
530 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
531 {
532 dst_reg saturated(this, glsl_type::vec4_type);
533 vec4_instruction *inst = emit(MOV(saturated, src0));
534 inst->saturate = true;
535
536 dst_reg scaled(this, glsl_type::vec4_type);
537 emit(MUL(scaled, src_reg(saturated), src_reg(255.0f)));
538
539 dst_reg rounded(this, glsl_type::vec4_type);
540 emit(RNDE(rounded, src_reg(scaled)));
541
542 dst_reg u(this, glsl_type::uvec4_type);
543 emit(MOV(u, src_reg(rounded)));
544
545 src_reg bytes(u);
546 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
547 }
548
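/**
 * Implement packSnorm4x8(): clamp to [-1, 1], scale by 127, round to even,
 * convert to signed and pack the low byte of each channel.
 */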
549 void
550 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
551 {
552 dst_reg max(this, glsl_type::vec4_type);
553 emit_minmax(BRW_CONDITIONAL_GE, max, src0, src_reg(-1.0f));
554
555 dst_reg min(this, glsl_type::vec4_type);
556 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), src_reg(1.0f));
557
558 dst_reg scaled(this, glsl_type::vec4_type);
559 emit(MUL(scaled, src_reg(min), src_reg(127.0f)));
560
561 dst_reg rounded(this, glsl_type::vec4_type);
562 emit(RNDE(rounded, src_reg(scaled)));
563
564 dst_reg i(this, glsl_type::ivec4_type);
565 emit(MOV(i, src_reg(rounded)));
566
567 src_reg bytes(i);
568 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
569 }
570
571 /**
572 * Returns the minimum number of vec4 elements needed to pack a type.
573 *
574 * For simple types, it will return 1 (a single vec4); for matrices, the
575 * number of columns; for array and struct, the sum of the vec4_size of
576 * each of its elements; and for sampler and atomic, zero.
577 *
578 * This method is useful to calculate how much register space is needed to
579 * store a particular type.
580 */
581 extern "C" int
582 type_size_vec4(const struct glsl_type *type)
583 {
584 unsigned int i;
585 int size;
586
587 switch (type->base_type) {
588 case GLSL_TYPE_UINT:
589 case GLSL_TYPE_INT:
590 case GLSL_TYPE_FLOAT:
591 case GLSL_TYPE_BOOL:
592 if (type->is_matrix()) {
593 return type->matrix_columns;
594 } else {
595 /* Regardless of size of vector, it gets a vec4. This is bad
596 * packing for things like floats, but otherwise arrays become a
597 * mess. Hopefully a later pass over the code can pack scalars
598 * down if appropriate.
599 */
600 return 1;
601 }
602 case GLSL_TYPE_ARRAY:
603 assert(type->length > 0);
604 return type_size_vec4(type->fields.array) * type->length;
605 case GLSL_TYPE_STRUCT:
606 size = 0;
607 for (i = 0; i < type->length; i++) {
608 size += type_size_vec4(type->fields.structure[i].type);
609 }
610 return size;
611 case GLSL_TYPE_SUBROUTINE:
612 return 1;
613
614 case GLSL_TYPE_SAMPLER:
615 /* Samplers take up no register space, since they're baked in at
616 * link time.
617 */
618 return 0;
619 case GLSL_TYPE_ATOMIC_UINT:
620 return 0;
621 case GLSL_TYPE_IMAGE:
622 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
623 case GLSL_TYPE_VOID:
624 case GLSL_TYPE_DOUBLE:
625 case GLSL_TYPE_ERROR:
626 case GLSL_TYPE_INTERFACE:
627 unreachable("not reached");
628 }
629
630 return 0;
631 }
632
633 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
634 {
635 init();
636
637 this->file = GRF;
638 this->reg = v->alloc.allocate(type_size_vec4(type));
639
640 if (type->is_array() || type->is_record()) {
641 this->swizzle = BRW_SWIZZLE_NOOP;
642 } else {
643 this->swizzle = brw_swizzle_for_size(type->vector_elements);
644 }
645
646 this->type = brw_type_for_base_type(type);
647 }
648
649 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
650 {
651 assert(size > 0);
652
653 init();
654
655 this->file = GRF;
656 this->reg = v->alloc.allocate(type_size_vec4(type) * size);
657
658 this->swizzle = BRW_SWIZZLE_NOOP;
659
660 this->type = brw_type_for_base_type(type);
661 }
662
663 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
664 {
665 init();
666
667 this->file = GRF;
668 this->reg = v->alloc.allocate(type_size_vec4(type));
669
670 if (type->is_array() || type->is_record()) {
671 this->writemask = WRITEMASK_XYZW;
672 } else {
673 this->writemask = (1 << type->vector_elements) - 1;
674 }
675
676 this->type = brw_type_for_base_type(type);
677 }
678
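/**
 * Emit a MIN/MAX as a SEL with the requested conditional mod on Gen6+, or
 * as a CMP followed by a predicated SEL on earlier generations.
 */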
679 vec4_instruction *
680 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
681 src_reg src0, src_reg src1)
682 {
683 vec4_instruction *inst;
684
685 if (devinfo->gen >= 6) {
686 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
687 inst->conditional_mod = conditionalmod;
688 } else {
689 emit(CMP(dst, src0, src1, conditionalmod));
690
691 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
692 inst->predicate = BRW_PREDICATE_NORMAL;
693 }
694
695 return inst;
696 }
697
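/**
 * Emit a linear interpolation of \p x and \p y by \p a: a single LRP
 * instruction on Gen6+ (note the reversed argument order), or the expanded
 * x*(1-a) + y*a sequence on earlier generations.
 */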
698 vec4_instruction *
699 vec4_visitor::emit_lrp(const dst_reg &dst,
700 const src_reg &x, const src_reg &y, const src_reg &a)
701 {
702 if (devinfo->gen >= 6) {
703 /* Note that the instruction's argument order is reversed from GLSL
704 * and the IR.
705 */
706 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
707 fix_3src_operand(x)));
708 } else {
709 /* Earlier generations don't support three source operations, so we
710 * need to emit x*(1-a) + y*a.
711 */
712 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
713 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
714 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
715 y_times_a.writemask = dst.writemask;
716 one_minus_a.writemask = dst.writemask;
717 x_times_one_minus_a.writemask = dst.writemask;
718
719 emit(MUL(y_times_a, y, a));
720 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
721 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
722 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
723 }
724 }
725
726 /**
727 * Emits the instructions needed to perform a pull constant load. before_block
728 * and before_inst can be NULL, in which case the instruction will be appended
729 * to the end of the instruction list.
730 */
731 void
732 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
733 src_reg surf_index,
734 src_reg offset_reg,
735 bblock_t *before_block,
736 vec4_instruction *before_inst)
737 {
738 assert((before_inst == NULL && before_block == NULL) ||
739 (before_inst && before_block));
740
741 vec4_instruction *pull;
742
743 if (devinfo->gen >= 9) {
744 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
745 src_reg header(this, glsl_type::uvec4_type, 2);
746
747 pull = new(mem_ctx)
748 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
749 dst_reg(header));
750
751 if (before_inst)
752 emit_before(before_block, before_inst, pull);
753 else
754 emit(pull);
755
756 dst_reg index_reg = retype(offset(dst_reg(header), 1),
757 offset_reg.type);
758 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
759
760 if (before_inst)
761 emit_before(before_block, before_inst, pull);
762 else
763 emit(pull);
764
765 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
766 dst,
767 surf_index,
768 header);
769 pull->mlen = 2;
770 pull->header_size = 1;
771 } else if (devinfo->gen >= 7) {
772 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
773
774 grf_offset.type = offset_reg.type;
775
776 pull = MOV(grf_offset, offset_reg);
777
778 if (before_inst)
779 emit_before(before_block, before_inst, pull);
780 else
781 emit(pull);
782
783 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
784 dst,
785 surf_index,
786 src_reg(grf_offset));
787 pull->mlen = 1;
788 } else {
789 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
790 dst,
791 surf_index,
792 offset_reg);
793 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
794 pull->mlen = 1;
795 }
796
797 if (before_inst)
798 emit_before(before_block, before_inst, pull);
799 else
800 emit(pull);
801 }
802
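/**
 * Copy the value of \p src for the first live channel into a register that
 * is uniform across channels, using FIND_LIVE_CHANNEL and BROADCAST.
 */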
803 src_reg
804 vec4_visitor::emit_uniformize(const src_reg &src)
805 {
806 const src_reg chan_index(this, glsl_type::uint_type);
807 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
808 src.type);
809
810 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
811 ->force_writemask_all = true;
812 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
813 ->force_writemask_all = true;
814
815 return src_reg(dst);
816 }
817
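/**
 * Fetch the MCS (multisample control surface) data for a texel with a
 * TXF_MCS message, for use by a following compressed multisample fetch.
 */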
818 src_reg
819 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
820 src_reg coordinate, src_reg sampler)
821 {
822 vec4_instruction *inst =
823 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
824 dst_reg(this, glsl_type::uvec4_type));
825 inst->base_mrf = 2;
826 inst->src[1] = sampler;
827
828 int param_base;
829
830 if (devinfo->gen >= 9) {
831 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
832 vec4_instruction *header_inst = new(mem_ctx)
833 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
834 dst_reg(MRF, inst->base_mrf));
835
836 emit(header_inst);
837
838 inst->mlen = 2;
839 inst->header_size = 1;
840 param_base = inst->base_mrf + 1;
841 } else {
842 inst->mlen = 1;
843 param_base = inst->base_mrf;
844 }
845
846 /* parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
847 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
848 int zero_mask = 0xf & ~coord_mask;
849
850 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
851 coordinate));
852
853 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
854 src_reg(0)));
855
856 emit(inst);
857 return src_reg(inst->dst);
858 }
859
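/**
 * Returns true if the sampler index can't be encoded directly in the
 * message descriptor (it is dynamically indexed or >= 16) and so has to go
 * through the message header; only possible on Haswell and later.
 */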
860 bool
861 vec4_visitor::is_high_sampler(src_reg sampler)
862 {
863 if (devinfo->gen < 8 && !devinfo->is_haswell)
864 return false;
865
866 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
867 }
868
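/**
 * Emit the sampler message for a texture operation: map the IR opcode to a
 * hardware message, load the coordinate, shadow comparitor, LOD/derivative,
 * sample index/MCS and offset parameters into MRFs, then apply the cube
 * array size fixup, the Gen6 gather workaround and the result swizzle.
 */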
869 void
870 vec4_visitor::emit_texture(ir_texture_opcode op,
871 dst_reg dest,
872 const glsl_type *dest_type,
873 src_reg coordinate,
874 int coord_components,
875 src_reg shadow_comparitor,
876 src_reg lod, src_reg lod2,
877 src_reg sample_index,
878 uint32_t constant_offset,
879 src_reg offset_value,
880 src_reg mcs,
881 bool is_cube_array,
882 uint32_t sampler,
883 src_reg sampler_reg)
884 {
885 /* The sampler can only meaningfully compute LOD for fragment shader
886 * messages. For all other stages, we change the opcode to TXL and hardcode
887 * the LOD to 0.
888 *
889 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
890 * valid LOD argument.
891 */
892 if (op == ir_tex || op == ir_query_levels) {
893 assert(lod.file == BAD_FILE);
894 lod = src_reg(0.0f);
895 }
896
897 enum opcode opcode;
898 switch (op) {
899 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
900 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
901 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
902 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
903 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
904 SHADER_OPCODE_TXF_CMS); break;
905 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
906 case ir_tg4: opcode = offset_value.file != BAD_FILE
907 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
908 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
909 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
910 case ir_txb:
911 unreachable("TXB is not valid for vertex shaders.");
912 case ir_lod:
913 unreachable("LOD is not valid for vertex shaders.");
914 default:
915 unreachable("Unrecognized tex op");
916 }
917
918 vec4_instruction *inst = new(mem_ctx) vec4_instruction(
919 opcode, dst_reg(this, dest_type));
920
921 inst->offset = constant_offset;
922
923 /* The message header is necessary for:
924 * - Gen4 (always)
925 * - Gen9+ for selecting SIMD4x2
926 * - Texel offsets
927 * - Gather channel selection
928 * - Sampler indices too large to fit in a 4-bit value.
929 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
930 */
931 inst->header_size =
932 (devinfo->gen < 5 || devinfo->gen >= 9 ||
933 inst->offset != 0 || op == ir_tg4 ||
934 op == ir_texture_samples ||
935 is_high_sampler(sampler_reg)) ? 1 : 0;
936 inst->base_mrf = 2;
937 inst->mlen = inst->header_size;
938 inst->dst.writemask = WRITEMASK_XYZW;
939 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
940
941 inst->src[1] = sampler_reg;
942
943 /* MRF for the first parameter */
944 int param_base = inst->base_mrf + inst->header_size;
945
946 if (op == ir_txs || op == ir_query_levels) {
947 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
948 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
949 inst->mlen++;
950 } else if (op == ir_texture_samples) {
951 inst->dst.writemask = WRITEMASK_X;
952 } else {
953 /* Load the coordinate */
954 /* FINISHME: gl_clamp_mask and saturate */
955 int coord_mask = (1 << coord_components) - 1;
956 int zero_mask = 0xf & ~coord_mask;
957
958 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
959 coordinate));
960 inst->mlen++;
961
962 if (zero_mask != 0) {
963 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
964 src_reg(0)));
965 }
966 /* Load the shadow comparitor */
967 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
968 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
969 WRITEMASK_X),
970 shadow_comparitor));
971 inst->mlen++;
972 }
973
974 /* Load the LOD info */
975 if (op == ir_tex || op == ir_txl) {
976 int mrf, writemask;
977 if (devinfo->gen >= 5) {
978 mrf = param_base + 1;
979 if (shadow_comparitor.file != BAD_FILE) {
980 writemask = WRITEMASK_Y;
981 /* mlen already incremented */
982 } else {
983 writemask = WRITEMASK_X;
984 inst->mlen++;
985 }
986 } else /* devinfo->gen == 4 */ {
987 mrf = param_base;
988 writemask = WRITEMASK_W;
989 }
990 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
991 } else if (op == ir_txf) {
992 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
993 } else if (op == ir_txf_ms) {
994 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
995 sample_index));
996 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
997 /* MCS data is stored in the first two channels of `mcs`, but we
998 * need to get it into the .y and .z channels of the second vec4
999 * of params.
1000 */
1001 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1002 emit(MOV(dst_reg(MRF, param_base + 1,
1003 glsl_type::uint_type, WRITEMASK_YZ),
1004 mcs));
1005 } else if (devinfo->gen >= 7) {
1006 /* MCS data is in the first channel of `mcs`, but we need to get it into
1007 * the .y channel of the second vec4 of params, so replicate .x across
1008 * the whole vec4 and then mask off everything except .y
1009 */
1010 mcs.swizzle = BRW_SWIZZLE_XXXX;
1011 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1012 mcs));
1013 }
1014 inst->mlen++;
1015 } else if (op == ir_txd) {
1016 const brw_reg_type type = lod.type;
1017
1018 if (devinfo->gen >= 5) {
1019 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1020 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1021 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1022 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1023 inst->mlen++;
1024
1025 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1026 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1027 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1028 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1029 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1030 inst->mlen++;
1031
1032 if (shadow_comparitor.file != BAD_FILE) {
1033 emit(MOV(dst_reg(MRF, param_base + 2,
1034 shadow_comparitor.type, WRITEMASK_Z),
1035 shadow_comparitor));
1036 }
1037 }
1038 } else /* devinfo->gen == 4 */ {
1039 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1040 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1041 inst->mlen += 2;
1042 }
1043 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1044 if (shadow_comparitor.file != BAD_FILE) {
1045 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1046 shadow_comparitor));
1047 }
1048
1049 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1050 offset_value));
1051 inst->mlen++;
1052 }
1053 }
1054
1055 emit(inst);
1056
1057 /* Fix up num layers (z) for cube arrays: hardware returns faces * layers;
1058 * spec requires layers.
1059 */
1060 if (op == ir_txs && is_cube_array) {
1061 emit_math(SHADER_OPCODE_INT_QUOTIENT,
1062 writemask(inst->dst, WRITEMASK_Z),
1063 src_reg(inst->dst), src_reg(6));
1064 }
1065
1066 if (devinfo->gen == 6 && op == ir_tg4) {
1067 emit_gen6_gather_wa(key_tex->gen6_gather_wa[sampler], inst->dst);
1068 }
1069
1070 swizzle_result(op, dest,
1071 src_reg(inst->dst), sampler, dest_type);
1072 }
1073
1074 /**
1075 * Apply workarounds for Gen6 gather with UINT/SINT
1076 */
1077 void
1078 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1079 {
1080 if (!wa)
1081 return;
1082
1083 int width = (wa & WA_8BIT) ? 8 : 16;
1084 dst_reg dst_f = dst;
1085 dst_f.type = BRW_REGISTER_TYPE_F;
1086
1087 /* Convert from UNORM to UINT */
1088 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
1089 emit(MOV(dst, src_reg(dst_f)));
1090
1091 if (wa & WA_SIGN) {
1092 /* Reinterpret the UINT value as a signed INT value by
1093 * shifting the sign bit into place, then shifting back
1094 * preserving sign.
1095 */
1096 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
1097 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
1098 }
1099 }
1100
1101 /**
1102 * Set up the gather channel based on the swizzle, for gather4.
1103 */
1104 uint32_t
1105 vec4_visitor::gather_channel(unsigned gather_component, uint32_t sampler)
1106 {
1107 int swiz = GET_SWZ(key_tex->swizzles[sampler], gather_component);
1108 switch (swiz) {
1109 case SWIZZLE_X: return 0;
1110 case SWIZZLE_Y:
1111 /* gather4 sampler is broken for green channel on RG32F --
1112 * we must ask for blue instead.
1113 */
1114 if (key_tex->gather_channel_quirk_mask & (1 << sampler))
1115 return 2;
1116 return 1;
1117 case SWIZZLE_Z: return 2;
1118 case SWIZZLE_W: return 3;
1119 default:
1120 unreachable("Not reached"); /* zero, one swizzles handled already */
1121 }
1122 }
1123
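/**
 * Apply the texture swizzle from the sampler key to a sample result,
 * writing constant 0.0 or 1.0 into channels selected as SWIZZLE_ZERO or
 * SWIZZLE_ONE.
 */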
1124 void
1125 vec4_visitor::swizzle_result(ir_texture_opcode op, dst_reg dest,
1126 src_reg orig_val, uint32_t sampler,
1127 const glsl_type *dest_type)
1128 {
1129 int s = key_tex->swizzles[sampler];
1130
1131 dst_reg swizzled_result = dest;
1132
1133 if (op == ir_query_levels) {
1134 /* # levels is in .w */
1135 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1136 emit(MOV(swizzled_result, orig_val));
1137 return;
1138 }
1139
1140 if (op == ir_txs || dest_type == glsl_type::float_type
1141 || s == SWIZZLE_NOOP || op == ir_tg4) {
1142 emit(MOV(swizzled_result, orig_val));
1143 return;
1144 }
1145
1146
1147 int zero_mask = 0, one_mask = 0, copy_mask = 0;
1148 int swizzle[4] = {0};
1149
1150 for (int i = 0; i < 4; i++) {
1151 switch (GET_SWZ(s, i)) {
1152 case SWIZZLE_ZERO:
1153 zero_mask |= (1 << i);
1154 break;
1155 case SWIZZLE_ONE:
1156 one_mask |= (1 << i);
1157 break;
1158 default:
1159 copy_mask |= (1 << i);
1160 swizzle[i] = GET_SWZ(s, i);
1161 break;
1162 }
1163 }
1164
1165 if (copy_mask) {
1166 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1167 swizzled_result.writemask = copy_mask;
1168 emit(MOV(swizzled_result, orig_val));
1169 }
1170
1171 if (zero_mask) {
1172 swizzled_result.writemask = zero_mask;
1173 emit(MOV(swizzled_result, src_reg(0.0f)));
1174 }
1175
1176 if (one_mask) {
1177 swizzled_result.writemask = one_mask;
1178 emit(MOV(swizzled_result, src_reg(1.0f)));
1179 }
1180 }
1181
1182 void
1183 vec4_visitor::gs_emit_vertex(int stream_id)
1184 {
1185 unreachable("not reached");
1186 }
1187
1188 void
1189 vec4_visitor::gs_end_primitive()
1190 {
1191 unreachable("not reached");
1192 }
1193
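/**
 * Assemble the payload (the offset plus up to two operands, one register
 * each, in the .x channel) and emit an untyped atomic message.
 */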
1194 void
1195 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1196 dst_reg dst, src_reg surf_offset,
1197 src_reg src0, src_reg src1)
1198 {
1199 unsigned mlen = 1 + (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
1200 src_reg src_payload(this, glsl_type::uint_type, mlen);
1201 dst_reg payload(src_payload);
1202 payload.writemask = WRITEMASK_X;
1203
1204 /* Set the atomic operation offset. */
1205 emit(MOV(offset(payload, 0), surf_offset));
1206 unsigned i = 1;
1207
1208 /* Set the atomic operation arguments. */
1209 if (src0.file != BAD_FILE) {
1210 emit(MOV(offset(payload, i), src0));
1211 i++;
1212 }
1213
1214 if (src1.file != BAD_FILE) {
1215 emit(MOV(offset(payload, i), src1));
1216 i++;
1217 }
1218
1219 /* Emit the instruction. Note that this maps to the normal SIMD8
1220 * untyped atomic message on Ivy Bridge, but that's OK because
1221 * unused channels will be masked out.
1222 */
1223 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1224 src_payload,
1225 src_reg(surf_index), src_reg(atomic_op));
1226 inst->mlen = mlen;
1227 }
1228
1229 void
1230 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1231 src_reg surf_offset)
1232 {
1233 dst_reg offset(this, glsl_type::uint_type);
1234 offset.writemask = WRITEMASK_X;
1235
1236 /* Set the surface read offset. */
1237 emit(MOV(offset, surf_offset));
1238
1239 /* Emit the instruction. Note that this maps to the normal SIMD8
1240 * untyped surface read message, but that's OK because unused
1241 * channels will be masked out.
1242 */
1243 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1244 src_reg(offset),
1245 src_reg(surf_index), src_reg(1));
1246 inst->mlen = 1;
1247 }
1248
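/**
 * Compute the NDC output, (x/w, y/w, z/w, 1/w), from the written
 * gl_Position; used on pre-Gen6 hardware, where emit_vertex() calls this.
 */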
1249 void
1250 vec4_visitor::emit_ndc_computation()
1251 {
1252 if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1253 return;
1254
1255 /* Get the position */
1256 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1257
1258 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1259 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1260 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1261
1262 current_annotation = "NDC";
1263 dst_reg ndc_w = ndc;
1264 ndc_w.writemask = WRITEMASK_W;
1265 src_reg pos_w = pos;
1266 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1267 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1268
1269 dst_reg ndc_xyz = ndc;
1270 ndc_xyz.writemask = WRITEMASK_XYZ;
1271
1272 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1273 }
1274
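/**
 * Write the packed header VUE slot: on pre-Gen6 hardware a dword holding
 * point size, user clip flags and the negative-w workaround bit; on Gen6+
 * point size, layer and viewport index in the .w, .y and .z channels.
 */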
1275 void
1276 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1277 {
1278 if (devinfo->gen < 6 &&
1279 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1280 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1281 devinfo->has_negative_rhw_bug)) {
1282 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1283 dst_reg header1_w = header1;
1284 header1_w.writemask = WRITEMASK_W;
1285
1286 emit(MOV(header1, 0u));
1287
1288 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1289 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1290
1291 current_annotation = "Point size";
1292 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
1293 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
1294 }
1295
1296 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1297 current_annotation = "Clipping flags";
1298 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1299 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1300
1301 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
1302 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
1303 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1304
1305 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
1306 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
1307 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
1308 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1309 }
1310
1311 /* i965 clipping workaround:
1312 * 1) Test for -ve rhw
1313 * 2) If set,
1314 * set ndc = (0,0,0,0)
1315 * set ucp[6] = 1
1316 *
1317 * Later, clipping will detect ucp[6] and ensure the primitive is
1318 * clipped against all fixed planes.
1319 */
1320 if (devinfo->has_negative_rhw_bug &&
1321 output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1322 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1323 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1324 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
1325 vec4_instruction *inst;
1326 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
1327 inst->predicate = BRW_PREDICATE_NORMAL;
1328 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1329 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
1330 inst->predicate = BRW_PREDICATE_NORMAL;
1331 }
1332
1333 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1334 } else if (devinfo->gen < 6) {
1335 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
1336 } else {
1337 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
1338 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1339 dst_reg reg_w = reg;
1340 reg_w.writemask = WRITEMASK_W;
1341 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1342 reg_as_src.type = reg_w.type;
1343 reg_as_src.swizzle = brw_swizzle_for_size(1);
1344 emit(MOV(reg_w, reg_as_src));
1345 }
1346 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1347 dst_reg reg_y = reg;
1348 reg_y.writemask = WRITEMASK_Y;
1349 reg_y.type = BRW_REGISTER_TYPE_D;
1350 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1351 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1352 }
1353 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1354 dst_reg reg_z = reg;
1355 reg_z.writemask = WRITEMASK_Z;
1356 reg_z.type = BRW_REGISTER_TYPE_D;
1357 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1358 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1359 }
1360 }
1361 }
1362
1363 vec4_instruction *
1364 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1365 {
1366 assert(varying < VARYING_SLOT_MAX);
1367 assert(output_reg[varying].type == reg.type);
1368 current_annotation = output_reg_annotation[varying];
1369 if (output_reg[varying].file != BAD_FILE)
1370 return emit(MOV(reg, src_reg(output_reg[varying])));
1371 else
1372 return NULL;
1373 }
1374
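/**
 * Write one VUE slot into URB message register \p reg, handling the
 * special PSIZ, NDC, position and edge flag slots and deferring to
 * emit_generic_urb_slot() for everything else.
 */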
1375 void
1376 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1377 {
1378 reg.type = BRW_REGISTER_TYPE_F;
1379 output_reg[varying].type = reg.type;
1380
1381 switch (varying) {
1382 case VARYING_SLOT_PSIZ:
1383 {
1384 /* PSIZ is always in slot 0, and is coupled with other flags. */
1385 current_annotation = "indices, point width, clip flags";
1386 emit_psiz_and_flags(reg);
1387 break;
1388 }
1389 case BRW_VARYING_SLOT_NDC:
1390 current_annotation = "NDC";
1391 if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1392 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1393 break;
1394 case VARYING_SLOT_POS:
1395 current_annotation = "gl_Position";
1396 if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1397 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1398 break;
1399 case VARYING_SLOT_EDGE:
1400 /* This is present when doing unfilled polygons. We're supposed to copy
1401 * the edge flag from the user-provided vertex array
1402 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1403 * of that attribute (starts as 1.0f). This is then used in clipping to
1404 * determine which edges should be drawn as wireframe.
1405 */
1406 current_annotation = "edge flag";
1407 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1408 glsl_type::float_type, WRITEMASK_XYZW))));
1409 break;
1410 case BRW_VARYING_SLOT_PAD:
1411 /* No need to write to this slot */
1412 break;
1413 default:
1414 emit_generic_urb_slot(reg, varying);
1415 break;
1416 }
1417 }
1418
1419 static int
1420 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1421 {
1422 if (devinfo->gen >= 6) {
1423 /* URB data written (does not include the message header reg) must
1424 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1425 * section 5.4.3.2.2: URB_INTERLEAVED.
1426 *
1427 * URB entries are allocated on a multiple of 1024 bits, so an
1428 * extra 128 bits written here to make the end align to 256 is
1429 * no problem.
1430 */
1431 if ((mlen % 2) != 1)
1432 mlen++;
1433 }
1434
1435 return mlen;
1436 }
1437
1438
1439 /**
1440 * Generates the VUE payload plus the necessary URB write instructions to
1441 * output it.
1442 *
1443 * The VUE layout is documented in Volume 2a.
1444 */
1445 void
1446 vec4_visitor::emit_vertex()
1447 {
1448 /* MRF 0 is reserved for the debugger, so start with message header
1449 * in MRF 1.
1450 */
1451 int base_mrf = 1;
1452 int mrf = base_mrf;
1453 /* In the process of generating our URB write message contents, we
1454 * may need to unspill a register or load from an array. Those
1455 * reads would use MRFs 14-15.
1456 */
1457 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1458
1459 /* The following assertion verifies that max_usable_mrf causes an
1460 * even-numbered amount of URB write data, which will meet gen6's
1461 * requirements for length alignment.
1462 */
1463 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1464
1465 /* First mrf is the g0-based message header containing URB handles and
1466 * such.
1467 */
1468 emit_urb_write_header(mrf++);
1469
1470 if (devinfo->gen < 6) {
1471 emit_ndc_computation();
1472 }
1473
1474 /* We may need to split this up into several URB writes, so do them in a
1475 * loop.
1476 */
1477 int slot = 0;
1478 bool complete = false;
1479 do {
1480 /* URB offset is in URB row increments, and each of our MRFs is half of
1481 * one of those, since we're doing interleaved writes.
1482 */
1483 int offset = slot / 2;
1484
1485 mrf = base_mrf + 1;
1486 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1487 emit_urb_slot(dst_reg(MRF, mrf++),
1488 prog_data->vue_map.slot_to_varying[slot]);
1489
1490 /* If this was max_usable_mrf, we can't fit anything more into this
1491 * URB WRITE. Same thing if we reached the maximum length available.
1492 */
1493 if (mrf > max_usable_mrf ||
1494 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1495 slot++;
1496 break;
1497 }
1498 }
1499
1500 complete = slot >= prog_data->vue_map.num_slots;
1501 current_annotation = "URB write";
1502 vec4_instruction *inst = emit_urb_write_opcode(complete);
1503 inst->base_mrf = base_mrf;
1504 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1505 inst->offset += offset;
1506 } while(!complete);
1507 }
1508
1509
1510 src_reg
1511 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1512 src_reg *reladdr, int reg_offset)
1513 {
1514 /* Because we store the values to scratch interleaved like our
1515 * vertex data, we need to scale the vec4 index by 2.
1516 */
1517 int message_header_scale = 2;
1518
1519 /* Pre-gen6, the message header uses byte offsets instead of vec4
1520 * (16-byte) offset units.
1521 */
1522 if (devinfo->gen < 6)
1523 message_header_scale *= 16;
1524
1525 if (reladdr) {
1526 src_reg index = src_reg(this, glsl_type::int_type);
1527
1528 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1529 src_reg(reg_offset)));
1530 emit_before(block, inst, MUL(dst_reg(index), index,
1531 src_reg(message_header_scale)));
1532
1533 return index;
1534 } else {
1535 return src_reg(reg_offset * message_header_scale);
1536 }
1537 }
1538
1539 src_reg
1540 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
1541 src_reg *reladdr, int reg_offset)
1542 {
1543 if (reladdr) {
1544 src_reg index = src_reg(this, glsl_type::int_type);
1545
1546 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1547 src_reg(reg_offset)));
1548
1549 /* Pre-gen6, the message header uses byte offsets instead of vec4
1550 * (16-byte) offset units.
1551 */
1552 if (devinfo->gen < 6) {
1553 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
1554 }
1555
1556 return index;
1557 } else if (devinfo->gen >= 8) {
1558 /* Store the offset in a GRF so we can send-from-GRF. */
1559 src_reg offset = src_reg(this, glsl_type::int_type);
1560 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
1561 return offset;
1562 } else {
1563 int message_header_scale = devinfo->gen < 6 ? 16 : 1;
1564 return src_reg(reg_offset * message_header_scale);
1565 }
1566 }
1567
1568 /**
1569 * Emits an instruction before @inst to load the value named by @orig_src
1570 * from scratch space at @base_offset to @temp.
1571 *
1572 * @base_offset is measured in 32-byte units (the size of a register).
1573 */
1574 void
1575 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1576 dst_reg temp, src_reg orig_src,
1577 int base_offset)
1578 {
1579 int reg_offset = base_offset + orig_src.reg_offset;
1580 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1581 reg_offset);
1582
1583 emit_before(block, inst, SCRATCH_READ(temp, index));
1584 }
1585
1586 /**
1587 * Emits an instruction after @inst to store the value to be written
1588 * to @orig_dst to scratch space at @base_offset, from @temp.
1589 *
1590 * @base_offset is measured in 32-byte units (the size of a register).
1591 */
1592 void
1593 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1594 int base_offset)
1595 {
1596 int reg_offset = base_offset + inst->dst.reg_offset;
1597 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1598 reg_offset);
1599
1600 /* Create a temporary register to store *inst's result in.
1601 *
1602 * We have to be careful in MOVing from our temporary result register in
1603 * the scratch write. If we swizzle from channels of the temporary that
1604 * weren't initialized, it will confuse live interval analysis, which will
1605 * make spilling fail to make progress.
1606 */
1607 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1608 inst->dst.type),
1609 brw_swizzle_for_mask(inst->dst.writemask));
1610 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1611 inst->dst.writemask));
1612 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1613 if (inst->opcode != BRW_OPCODE_SEL)
1614 write->predicate = inst->predicate;
1615 write->ir = inst->ir;
1616 write->annotation = inst->annotation;
1617 inst->insert_after(block, write);
1618
1619 inst->dst.file = temp.file;
1620 inst->dst.reg = temp.reg;
1621 inst->dst.reg_offset = temp.reg_offset;
1622 inst->dst.reladdr = NULL;
1623 }
1624
1625 /**
1626 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1627 * adds the scratch read(s) before \p inst. The function also checks for
1628 * recursive reladdr scratch accesses, issuing the corresponding scratch
1629 * loads and rewriting reladdr references accordingly.
1630 *
1631 * \return \p src if it did not require a scratch load, otherwise, the
1632 * register holding the result of the scratch load that the caller should
1633 * use to rewrite src.
1634 */
1635 src_reg
1636 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1637 vec4_instruction *inst, src_reg src)
1638 {
1639 /* Resolve recursive reladdr scratch access by calling ourselves
1640 * with src.reladdr
1641 */
1642 if (src.reladdr)
1643 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1644 *src.reladdr);
1645
1646 /* Now handle scratch access on src */
1647 if (src.file == GRF && scratch_loc[src.reg] != -1) {
1648 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1649 emit_scratch_read(block, inst, temp, src, scratch_loc[src.reg]);
1650 src.reg = temp.reg;
1651 src.reg_offset = temp.reg_offset;
1652 src.reladdr = NULL;
1653 }
1654
1655 return src;
1656 }
1657
1658 /**
1659 * We can't generally support array access in GRF space, because a
1660 * single instruction's destination can only span 2 contiguous
1661 * registers. So, we send all GRF arrays that get variable index
1662 * access to scratch space.
1663 */
1664 void
1665 vec4_visitor::move_grf_array_access_to_scratch()
1666 {
1667 int scratch_loc[this->alloc.count];
1668 memset(scratch_loc, -1, sizeof(scratch_loc));
1669
1670 /* First, calculate the set of virtual GRFs that need to be punted
1671 * to scratch due to having any array access on them, and where in
1672 * scratch.
1673 */
1674 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1675 if (inst->dst.file == GRF && inst->dst.reladdr) {
1676 if (scratch_loc[inst->dst.reg] == -1) {
1677 scratch_loc[inst->dst.reg] = last_scratch;
1678 last_scratch += this->alloc.sizes[inst->dst.reg];
1679 }
1680
1681 for (src_reg *iter = inst->dst.reladdr;
1682 iter->reladdr;
1683 iter = iter->reladdr) {
1684 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1685 scratch_loc[iter->reg] = last_scratch;
1686 last_scratch += this->alloc.sizes[iter->reg];
1687 }
1688 }
1689 }
1690
1691 for (int i = 0 ; i < 3; i++) {
1692 for (src_reg *iter = &inst->src[i];
1693 iter->reladdr;
1694 iter = iter->reladdr) {
1695 if (iter->file == GRF && scratch_loc[iter->reg] == -1) {
1696 scratch_loc[iter->reg] = last_scratch;
1697 last_scratch += this->alloc.sizes[iter->reg];
1698 }
1699 }
1700 }
1701 }
1702
1703 /* Now, for anything that will be accessed through scratch, rewrite
1704 * it to load/store. Note that this is a _safe list walk, because
1705 * we may generate a new scratch_write instruction after the one
1706 * we're processing.
1707 */
1708 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1709 /* Set up the annotation tracking for new generated instructions. */
1710 base_ir = inst->ir;
1711 current_annotation = inst->annotation;
1712
1713 /* First handle scratch access on the dst. Notice we have to handle
1714 * the case where the dst's reladdr also points to scratch space.
1715 */
1716 if (inst->dst.reladdr)
1717 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1718 *inst->dst.reladdr);
1719
1720 /* Now that we have handled any (possibly recursive) reladdr scratch
1721 * accesses for dst we can safely do the scratch write for dst itself
1722 */
1723 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1)
1724 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
1725
1726 /* Now handle scratch access on any src. In this case, since inst->src[i]
1727 * already is a src_reg, we can just call emit_resolve_reladdr with
1728 * inst->src[i] and it will take care of handling scratch loads for
1729 * both src and src.reladdr (recursively).
1730 */
1731 for (int i = 0 ; i < 3; i++) {
1732 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1733 inst->src[i]);
1734 }
1735 }
1736 }
1737
1738 /**
1739 * Emits an instruction before @inst to load the value named by @orig_src
1740 * from the pull constant buffer (surface) at @base_offset to @temp.
1741 */
1742 void
1743 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1744 dst_reg temp, src_reg orig_src,
1745 int base_offset)
1746 {
1747 int reg_offset = base_offset + orig_src.reg_offset;
1748 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
1749 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
1750 reg_offset);
1751
1752 emit_pull_constant_load_reg(temp,
1753 index,
1754 offset,
1755 block, inst);
1756 }
1757
1758 /**
1759 * Implements array access of uniforms by inserting a
1760 * PULL_CONSTANT_LOAD instruction.
1761 *
1762 * Unlike temporary GRF array access (where we don't support it due to
1763 * the difficulty of doing relative addressing on instruction
1764 * destinations), we could potentially do array access of uniforms
1765 * that were loaded in GRF space as push constants. In real-world
1766 * usage we've seen, though, the arrays being used are always larger
1767 * than we could load as push constants, so just always move all
1768 * uniform array access out to a pull constant buffer.
1769 */
1770 void
1771 vec4_visitor::move_uniform_array_access_to_pull_constants()
1772 {
1773 int pull_constant_loc[this->uniforms];
1774 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1775 bool nested_reladdr;
1776
1777 /* Walk through and find array access of uniforms. Put a copy of that
1778 * uniform in the pull constant buffer.
1779 *
1780 * Note that we don't move constant-indexed accesses to arrays. No
1781 * testing has been done of the performance impact of this choice.
1782 */
1783 do {
1784 nested_reladdr = false;
1785
1786 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1787 for (int i = 0 ; i < 3; i++) {
1788 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1789 continue;
1790
1791 int uniform = inst->src[i].reg;
1792
1793 if (inst->src[i].reladdr->reladdr)
1794 nested_reladdr = true; /* will need another pass */
1795
1796 /* If this array isn't already present in the pull constant buffer,
1797 * add it.
1798 */
1799 if (pull_constant_loc[uniform] == -1) {
1800 const gl_constant_value **values =
1801 &stage_prog_data->param[uniform * 4];
1802
1803 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
1804
1805 assert(uniform < uniform_array_size);
1806 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
1807 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1808 = values[j];
1809 }
1810 }
1811
1812 /* Set up the annotation tracking for new generated instructions. */
1813 base_ir = inst->ir;
1814 current_annotation = inst->annotation;
1815
1816 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1817
1818 emit_pull_constant_load(block, inst, temp, inst->src[i],
1819 pull_constant_loc[uniform]);
1820
1821 inst->src[i].file = temp.file;
1822 inst->src[i].reg = temp.reg;
1823 inst->src[i].reg_offset = temp.reg_offset;
1824 inst->src[i].reladdr = NULL;
1825 }
1826 }
1827 } while (nested_reladdr);
1828
1829 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1830 * no need to track them as larger-than-vec4 objects. This will be
1831 * relied on in cutting out unused uniform vectors from push
1832 * constants.
1833 */
1834 split_uniform_registers();
1835 }
1836
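/**
 * If \p reg is a UD-typed source with the negate modifier set, materialize
 * the negation with an explicit MOV into a temporary so later comparisons
 * see the resolved value.
 */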
1837 void
1838 vec4_visitor::resolve_ud_negate(src_reg *reg)
1839 {
1840 if (reg->type != BRW_REGISTER_TYPE_UD ||
1841 !reg->negate)
1842 return;
1843
1844 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1845 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1846 *reg = temp;
1847 }
1848
1849 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1850 void *log_data,
1851 const struct brw_sampler_prog_key_data *key_tex,
1852 struct brw_vue_prog_data *prog_data,
1853 const nir_shader *shader,
1854 void *mem_ctx,
1855 bool no_spills,
1856 int shader_time_index)
1857 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1858 key_tex(key_tex),
1859 prog_data(prog_data),
1860 fail_msg(NULL),
1861 first_non_payload_grf(0),
1862 need_all_constants_in_pull_buffer(false),
1863 no_spills(no_spills),
1864 shader_time_index(shader_time_index),
1865 last_scratch(0)
1866 {
1867 this->failed = false;
1868
1869 this->base_ir = NULL;
1870 this->current_annotation = NULL;
1871 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1872
1873 this->virtual_grf_start = NULL;
1874 this->virtual_grf_end = NULL;
1875 this->live_intervals = NULL;
1876
1877 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1878
1879 this->uniforms = 0;
1880
1881 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
1882 * at least one. See setup_uniforms() in brw_vec4.cpp.
1883 */
1884 this->uniform_array_size = 1;
1885 if (prog_data) {
1886 this->uniform_array_size =
1887 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
1888 }
1889
1890 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
1891 }
1892
1893 vec4_visitor::~vec4_visitor()
1894 {
1895 }
1896
1897
1898 void
1899 vec4_visitor::fail(const char *format, ...)
1900 {
1901 va_list va;
1902 char *msg;
1903
1904 if (failed)
1905 return;
1906
1907 failed = true;
1908
1909 va_start(va, format);
1910 msg = ralloc_vasprintf(mem_ctx, format, va);
1911 va_end(va);
1912 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1913
1914 this->fail_msg = msg;
1915
1916 if (debug_enabled) {
1917 fprintf(stderr, "%s", msg);
1918 }
1919 }
1920
1921 } /* namespace brw */