[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "brw_program.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->regs_written = (dst.file == BAD_FILE ? 0 : 1);
50 this->shadow_compare = false;
51 this->ir = NULL;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_size = 0;
54 this->flag_subreg = 0;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = NULL;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 inst->ir = this->base_ir;
65 inst->annotation = this->current_annotation;
66
67 this->instructions.push_tail(inst);
68
69 return inst;
70 }
71
72 vec4_instruction *
73 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
74 vec4_instruction *new_inst)
75 {
76 new_inst->ir = inst->ir;
77 new_inst->annotation = inst->annotation;
78
79 inst->insert_before(block, new_inst);
80
81 return inst;
82 }
83
84 vec4_instruction *
85 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
86 const src_reg &src1, const src_reg &src2)
87 {
88 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
89 }
90
91
92 vec4_instruction *
93 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
94 const src_reg &src1)
95 {
96 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
101 {
102 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
107 {
108 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
109 }
110
111 vec4_instruction *
112 vec4_visitor::emit(enum opcode opcode)
113 {
114 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
115 }
116
117 #define ALU1(op) \
118 vec4_instruction * \
119 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
120 { \
121 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
122 }
123
124 #define ALU2(op) \
125 vec4_instruction * \
126 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
127 const src_reg &src1) \
128 { \
129 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
130 src0, src1); \
131 }
132
133 #define ALU2_ACC(op) \
134 vec4_instruction * \
135 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
136 const src_reg &src1) \
137 { \
138 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
139 BRW_OPCODE_##op, dst, src0, src1); \
140 inst->writes_accumulator = true; \
141 return inst; \
142 }
143
144 #define ALU3(op) \
145 vec4_instruction * \
146 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
147 const src_reg &src1, const src_reg &src2) \
148 { \
149 assert(devinfo->gen >= 6); \
150 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
151 src0, src1, src2); \
152 }
153
154 ALU1(NOT)
155 ALU1(MOV)
156 ALU1(FRC)
157 ALU1(RNDD)
158 ALU1(RNDE)
159 ALU1(RNDZ)
160 ALU1(F32TO16)
161 ALU1(F16TO32)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2_ACC(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(DP3)
169 ALU2(DP4)
170 ALU2(DPH)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182 ALU3(MAD)
183 ALU2_ACC(ADDC)
184 ALU2_ACC(SUBB)
185 ALU2(MAC)
186
187 /** Gen4 predicated IF. */
188 vec4_instruction *
189 vec4_visitor::IF(enum brw_predicate predicate)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
194 inst->predicate = predicate;
195
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 vec4_instruction *
201 vec4_visitor::IF(src_reg src0, src_reg src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(devinfo->gen == 6);
205
206 vec4_instruction *inst;
207
208 resolve_ud_negate(&src0);
209 resolve_ud_negate(&src1);
210
211 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
212 src0, src1);
213 inst->conditional_mod = condition;
214
215 return inst;
216 }
217
218 /**
219 * CMP: Sets the low bit of the destination channels with the result
220 * of the comparison, while the upper bits are undefined, and updates
221 * the flag register with the packed 16 bits of the result.
222 */
223 vec4_instruction *
224 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
225 enum brw_conditional_mod condition)
226 {
227 vec4_instruction *inst;
228
229 /* Take the instruction:
230 *
231 * CMP null<d> src0<f> src1<f>
232 *
233 * Original gen4 does type conversion to the destination type before
234 * comparison, producing garbage results for floating point comparisons.
235 *
236 * The destination type doesn't matter on newer generations, so we set the
237 * type to match src0 so we can compact the instruction.
238 */
239 dst.type = src0.type;
240
241 resolve_ud_negate(&src0);
242 resolve_ud_negate(&src1);
243
244 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
245 inst->conditional_mod = condition;
246
247 return inst;
248 }
249
250 vec4_instruction *
251 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
252 {
253 vec4_instruction *inst;
254
255 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
256 dst, index);
257 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
258 inst->mlen = 2;
259
260 return inst;
261 }
262
263 vec4_instruction *
264 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
265 const src_reg &index)
266 {
267 vec4_instruction *inst;
268
269 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
270 dst, src, index);
271 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
272 inst->mlen = 3;
273
274 return inst;
275 }
276
277 src_reg
278 vec4_visitor::fix_3src_operand(const src_reg &src)
279 {
280 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
281 * able to use vertical stride of zero to replicate the vec4 uniform, like
282 *
283 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
284 *
285 * But you can't, since vertical stride is always four in three-source
286 * instructions. Instead, insert a MOV instruction to do the replication so
287 * that the three-source instruction can consume it.
288 */
289
290 /* The MOV is only needed if the source is a uniform or immediate. */
291 if (src.file != UNIFORM && src.file != IMM)
292 return src;
293
294 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
295 return src;
296
297 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
298 expanded.type = src.type;
299 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
300 return src_reg(expanded);
301 }
302
303 src_reg
304 vec4_visitor::resolve_source_modifiers(const src_reg &src)
305 {
306 if (!src.abs && !src.negate)
307 return src;
308
309 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
310 resolved.type = src.type;
311 emit(MOV(resolved, src));
312
313 return src_reg(resolved);
314 }
315
316 src_reg
317 vec4_visitor::fix_math_operand(const src_reg &src)
318 {
319 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
320 return src;
321
322 /* The gen6 math instruction ignores the source modifiers --
323 * swizzle, abs, negate, and at least some parts of the register
324 * region description.
325 *
326 * Rather than trying to enumerate all these cases, *always* expand the
327 * operand to a temp GRF for gen6.
328 *
329 * For gen7, keep the operand as-is, except if immediate, which gen7 still
330 * can't use.
331 */
332
333 if (devinfo->gen == 7 && src.file != IMM)
334 return src;
335
336 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
337 expanded.type = src.type;
338 emit(MOV(expanded, src));
339 return src_reg(expanded);
340 }
341
342 vec4_instruction *
343 vec4_visitor::emit_math(enum opcode opcode,
344 const dst_reg &dst,
345 const src_reg &src0, const src_reg &src1)
346 {
347 vec4_instruction *math =
348 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
349
350 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
351 /* MATH on Gen6 must be align1, so we can't do writemasks. */
352 math->dst = dst_reg(this, glsl_type::vec4_type);
353 math->dst.type = dst.type;
354 math = emit(MOV(dst, src_reg(math->dst)));
355 } else if (devinfo->gen < 6) {
356 math->base_mrf = 1;
357 math->mlen = src1.file == BAD_FILE ? 1 : 2;
358 }
359
360 return math;
361 }
362
363 void
364 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
365 {
366 if (devinfo->gen < 7) {
367 unreachable("ir_unop_pack_half_2x16 should be lowered");
368 }
369
370 assert(dst.type == BRW_REGISTER_TYPE_UD);
371 assert(src0.type == BRW_REGISTER_TYPE_F);
372
373 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
374 *
375 * Because this instruction does not have a 16-bit floating-point type,
376 * the destination data type must be Word (W).
377 *
378 * The destination must be DWord-aligned and specify a horizontal stride
379 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
380 * each destination channel and the upper word is not modified.
381 *
382 * The above restriction implies that the f32to16 instruction must use
383 * align1 mode, because only in align1 mode is it possible to specify
384 * horizontal stride. We choose here to defy the hardware docs and emit
385 * align16 instructions.
386 *
387 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
388 * instructions. I was partially successful in that the code passed all
389 * tests. However, the code was dubiously correct and fragile, and the
390 * tests were not harsh enough to probe that frailty. Not trusting the
391 * code, I chose instead to remain in align16 mode in defiance of the hw
392 * docs).
393 *
394 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
395 * simulator, emitting a f32to16 in align16 mode with UD as destination
396 * data type is safe. The behavior differs from that specified in the PRM
397 * in that the upper word of each destination channel is cleared to 0.
398 */
399
400 dst_reg tmp_dst(this, glsl_type::uvec2_type);
401 src_reg tmp_src(tmp_dst);
402
403 #if 0
404 /* Verify the undocumented behavior on which the following instructions
405 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
406 * then the result of the bit-or instruction below will be incorrect.
407 *
408 * You should inspect the disasm output in order to verify that the MOV is
409 * not optimized away.
410 */
411 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
412 #endif
413
414 /* Give tmp the form below, where "." means untouched.
415 *
416 * w z y x w z y x
417 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
418 *
419     * The upper word of each write-channel must be 0 for the following
420     * bit-shift and bit-or instructions to work. Note that this
421 * relies on the undocumented hardware behavior mentioned above.
422 */
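   /* For example, packHalf2x16(vec2(1.0, -2.0)): the F32TO16 below leaves
    * 0x00003c00 in tmp.x and 0x0000c000 in tmp.y, the SHL of .y by 16 gives
    * 0xc0000000, and the final OR with .x yields the packed result
    * 0xc0003c00.
    */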
423 tmp_dst.writemask = WRITEMASK_XY;
424 emit(F32TO16(tmp_dst, src0));
425
426 /* Give the write-channels of dst the form:
427 * 0xhhhh0000
428 */
429 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
430 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
431
432 /* Finally, give the write-channels of dst the form of packHalf2x16's
433 * output:
434 * 0xhhhhllll
435 */
436 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
437 emit(OR(dst, src_reg(dst), tmp_src));
438 }
439
440 void
441 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
442 {
443 if (devinfo->gen < 7) {
444 unreachable("ir_unop_unpack_half_2x16 should be lowered");
445 }
446
447 assert(dst.type == BRW_REGISTER_TYPE_F);
448 assert(src0.type == BRW_REGISTER_TYPE_UD);
449
450 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
451 *
452 * Because this instruction does not have a 16-bit floating-point type,
453 * the source data type must be Word (W). The destination type must be
454 * F (Float).
455 *
456 * To use W as the source data type, we must adjust horizontal strides,
457 * which is only possible in align1 mode. All my [chadv] attempts at
458 * emitting align1 instructions for unpackHalf2x16 failed to pass the
459 * Piglit tests, so I gave up.
460 *
461 * I've verified that, on gen7 hardware and the simulator, it is safe to
462 * emit f16to32 in align16 mode with UD as source data type.
463 */
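   /* For example, unpackHalf2x16(0xc0003c00u): the AND below leaves
    * 0x00003c00 in tmp.x, the SHR leaves 0x0000c000 in tmp.y, and F16TO32
    * converts them to (1.0, -2.0) in dst.xy.
    */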
464
465 dst_reg tmp_dst(this, glsl_type::uvec2_type);
466 src_reg tmp_src(tmp_dst);
467
468 tmp_dst.writemask = WRITEMASK_X;
469 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
470
471 tmp_dst.writemask = WRITEMASK_Y;
472 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
473
474 dst.writemask = WRITEMASK_XY;
475 emit(F16TO32(dst, tmp_src));
476 }
477
478 void
479 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
480 {
481 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
482 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
483 * is not suitable to generate the shift values, but we can use the packed
484 * vector float and a type-converting MOV.
485 */
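   /* In the 8-bit "vector float" immediate encoding (sign bit, 3-bit exponent
    * biased by 3, 4-bit mantissa), 0x00, 0x60, 0x70 and 0x78 decode to 0.0,
    * 8.0, 16.0 and 24.0, so the type-converting MOV below yields the integer
    * shift vector <0, 8, 16, 24>.  E.g. for src0 == 0x89674523 the
    * per-channel SHR results are 0x89674523, 0x00896745, 0x00008967 and
    * 0x00000089; the byte-typed MOV then keeps just the low bytes 0x23, 0x45,
    * 0x67, 0x89 before the scale by 1/255.
    */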
486 dst_reg shift(this, glsl_type::uvec4_type);
487 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
488
489 dst_reg shifted(this, glsl_type::uvec4_type);
490 src0.swizzle = BRW_SWIZZLE_XXXX;
491 emit(SHR(shifted, src0, src_reg(shift)));
492
493 shifted.type = BRW_REGISTER_TYPE_UB;
494 dst_reg f(this, glsl_type::vec4_type);
495 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
496
497 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
498 }
499
500 void
501 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
502 {
503 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
504 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
505 * is not suitable to generate the shift values, but we can use the packed
506 * vector float and a type-converting MOV.
507 */
508 dst_reg shift(this, glsl_type::uvec4_type);
509 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
510
511 dst_reg shifted(this, glsl_type::uvec4_type);
512 src0.swizzle = BRW_SWIZZLE_XXXX;
513 emit(SHR(shifted, src0, src_reg(shift)));
514
515 shifted.type = BRW_REGISTER_TYPE_B;
516 dst_reg f(this, glsl_type::vec4_type);
517 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
518
519 dst_reg scaled(this, glsl_type::vec4_type);
520 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
521
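   /* Clamp to [-1, 1]: a byte value of -128 scales to -128/127, slightly less
    * than -1, and unpackSnorm4x8 is defined as clamp(b / 127.0, -1.0, +1.0).
    */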
522 dst_reg max(this, glsl_type::vec4_type);
523 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
524 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
525 }
526
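/**
 * Pack a vec4 into 8-bit unsigned-normalized components, as in GLSL's
 * packUnorm4x8(): each component is clamped to [0, 1] by the saturating MOV,
 * scaled by 255, rounded to nearest-even and packed with .x in the least
 * significant byte.  For example, vec4(1.0, 0.0, 0.0, 1.0) packs to
 * 0xff0000ff.
 */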
527 void
528 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
529 {
530 dst_reg saturated(this, glsl_type::vec4_type);
531 vec4_instruction *inst = emit(MOV(saturated, src0));
532 inst->saturate = true;
533
534 dst_reg scaled(this, glsl_type::vec4_type);
535 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
536
537 dst_reg rounded(this, glsl_type::vec4_type);
538 emit(RNDE(rounded, src_reg(scaled)));
539
540 dst_reg u(this, glsl_type::uvec4_type);
541 emit(MOV(u, src_reg(rounded)));
542
543 src_reg bytes(u);
544 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
545 }
546
547 void
548 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
549 {
550 dst_reg max(this, glsl_type::vec4_type);
551 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
552
553 dst_reg min(this, glsl_type::vec4_type);
554 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
555
556 dst_reg scaled(this, glsl_type::vec4_type);
557 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
558
559 dst_reg rounded(this, glsl_type::vec4_type);
560 emit(RNDE(rounded, src_reg(scaled)));
561
562 dst_reg i(this, glsl_type::ivec4_type);
563 emit(MOV(i, src_reg(rounded)));
564
565 src_reg bytes(i);
566 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
567 }
568
569 /**
570 * Returns the minimum number of vec4 elements needed to pack a type.
571 *
572 * For simple types, it will return 1 (a single vec4); for matrices, the
573  * number of columns; for arrays and structs, the sum of the vec4 sizes of
574  * their elements; and for samplers and atomics, zero.
575 *
576 * This method is useful to calculate how much register space is needed to
577 * store a particular type.
578 */
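/* For example: float, bvec2 and vec4 each take 1; mat3 takes 3; vec4[8] and
 * float[8] each take 8; struct { vec4 a; float b; } takes 2; sampler2D and
 * atomic_uint take 0.
 */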
579 extern "C" int
580 type_size_vec4(const struct glsl_type *type)
581 {
582 unsigned int i;
583 int size;
584
585 switch (type->base_type) {
586 case GLSL_TYPE_UINT:
587 case GLSL_TYPE_INT:
588 case GLSL_TYPE_FLOAT:
589 case GLSL_TYPE_BOOL:
590 if (type->is_matrix()) {
591 return type->matrix_columns;
592 } else {
593 /* Regardless of size of vector, it gets a vec4. This is bad
594 * packing for things like floats, but otherwise arrays become a
595 * mess. Hopefully a later pass over the code can pack scalars
596 * down if appropriate.
597 */
598 return 1;
599 }
600 case GLSL_TYPE_ARRAY:
601 assert(type->length > 0);
602 return type_size_vec4(type->fields.array) * type->length;
603 case GLSL_TYPE_STRUCT:
604 size = 0;
605 for (i = 0; i < type->length; i++) {
606 size += type_size_vec4(type->fields.structure[i].type);
607 }
608 return size;
609 case GLSL_TYPE_SUBROUTINE:
610 return 1;
611
612 case GLSL_TYPE_SAMPLER:
613 /* Samplers take up no register space, since they're baked in at
614 * link time.
615 */
616 return 0;
617 case GLSL_TYPE_ATOMIC_UINT:
618 return 0;
619 case GLSL_TYPE_IMAGE:
620 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
621 case GLSL_TYPE_VOID:
622 case GLSL_TYPE_DOUBLE:
623 case GLSL_TYPE_ERROR:
624 case GLSL_TYPE_INTERFACE:
625 case GLSL_TYPE_FUNCTION:
626 unreachable("not reached");
627 }
628
629 return 0;
630 }
631
632 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
633 {
634 init();
635
636 this->file = VGRF;
637 this->nr = v->alloc.allocate(type_size_vec4(type));
638
639 if (type->is_array() || type->is_record()) {
640 this->swizzle = BRW_SWIZZLE_NOOP;
641 } else {
642 this->swizzle = brw_swizzle_for_size(type->vector_elements);
643 }
644
645 this->type = brw_type_for_base_type(type);
646 }
647
648 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
649 {
650 assert(size > 0);
651
652 init();
653
654 this->file = VGRF;
655 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
656
657 this->swizzle = BRW_SWIZZLE_NOOP;
658
659 this->type = brw_type_for_base_type(type);
660 }
661
662 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
663 {
664 init();
665
666 this->file = VGRF;
667 this->nr = v->alloc.allocate(type_size_vec4(type));
668
669 if (type->is_array() || type->is_record()) {
670 this->writemask = WRITEMASK_XYZW;
671 } else {
672 this->writemask = (1 << type->vector_elements) - 1;
673 }
674
675 this->type = brw_type_for_base_type(type);
676 }
677
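/**
 * Emit a conditional selection computing
 * dst = (src0 <cmod> src1) ? src0 : src1, so BRW_CONDITIONAL_GE gives
 * max(src0, src1) and BRW_CONDITIONAL_L gives min(src0, src1).  On Gen6+
 * this is a single SEL with a conditional mod; on earlier parts it is a CMP
 * followed by a predicated SEL.
 */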
678 vec4_instruction *
679 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
680 src_reg src0, src_reg src1)
681 {
682 vec4_instruction *inst;
683
684 if (devinfo->gen >= 6) {
685 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
686 inst->conditional_mod = conditionalmod;
687 } else {
688 emit(CMP(dst, src0, src1, conditionalmod));
689
690 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
691 inst->predicate = BRW_PREDICATE_NORMAL;
692 }
693
694 return inst;
695 }
696
697 vec4_instruction *
698 vec4_visitor::emit_lrp(const dst_reg &dst,
699 const src_reg &x, const src_reg &y, const src_reg &a)
700 {
701 if (devinfo->gen >= 6) {
702 /* Note that the instruction's argument order is reversed from GLSL
703 * and the IR.
704 */
705 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
706 fix_3src_operand(x)));
707 } else {
708 /* Earlier generations don't support three source operations, so we
709 * need to emit x*(1-a) + y*a.
710 */
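      /* (Equivalently, lrp(x, y, a) == x + (y - x) * a; the four instructions
       * below compute y*a, 1-a, x*(1-a) and the final sum.)
       */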
711 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
712 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
713 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
714 y_times_a.writemask = dst.writemask;
715 one_minus_a.writemask = dst.writemask;
716 x_times_one_minus_a.writemask = dst.writemask;
717
718 emit(MUL(y_times_a, y, a));
719 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
720 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
721 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
722 }
723 }
724
725 /**
726 * Emits the instructions needed to perform a pull constant load. before_block
727  * and before_inst can be NULL, in which case the instruction will be appended
728 * to the end of the instruction list.
729 */
730 void
731 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
732 src_reg surf_index,
733 src_reg offset_reg,
734 bblock_t *before_block,
735 vec4_instruction *before_inst)
736 {
737 assert((before_inst == NULL && before_block == NULL) ||
738 (before_inst && before_block));
739
740 vec4_instruction *pull;
741
742 if (devinfo->gen >= 9) {
743 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
744 src_reg header(this, glsl_type::uvec4_type, 2);
745
746 pull = new(mem_ctx)
747 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
748 dst_reg(header));
749
750 if (before_inst)
751 emit_before(before_block, before_inst, pull);
752 else
753 emit(pull);
754
755 dst_reg index_reg = retype(offset(dst_reg(header), 1),
756 offset_reg.type);
757 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
758
759 if (before_inst)
760 emit_before(before_block, before_inst, pull);
761 else
762 emit(pull);
763
764 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
765 dst,
766 surf_index,
767 header);
768 pull->mlen = 2;
769 pull->header_size = 1;
770 } else if (devinfo->gen >= 7) {
771 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
772
773 grf_offset.type = offset_reg.type;
774
775 pull = MOV(grf_offset, offset_reg);
776
777 if (before_inst)
778 emit_before(before_block, before_inst, pull);
779 else
780 emit(pull);
781
782 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
783 dst,
784 surf_index,
785 src_reg(grf_offset));
786 pull->mlen = 1;
787 } else {
788 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
789 dst,
790 surf_index,
791 offset_reg);
792 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
793 pull->mlen = 1;
794 }
795
796 if (before_inst)
797 emit_before(before_block, before_inst, pull);
798 else
799 emit(pull);
800 }
801
802 src_reg
803 vec4_visitor::emit_uniformize(const src_reg &src)
804 {
805 const src_reg chan_index(this, glsl_type::uint_type);
806 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
807 src.type);
808
809 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
810 ->force_writemask_all = true;
811 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
812 ->force_writemask_all = true;
813
814 return src_reg(dst);
815 }
816
817 src_reg
818 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
819 src_reg coordinate, src_reg sampler)
820 {
821 vec4_instruction *inst =
822 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
823 dst_reg(this, glsl_type::uvec4_type));
824 inst->base_mrf = 2;
825 inst->src[1] = sampler;
826
827 int param_base;
828
829 if (devinfo->gen >= 9) {
830 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
831 vec4_instruction *header_inst = new(mem_ctx)
832 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
833 dst_reg(MRF, inst->base_mrf));
834
835 emit(header_inst);
836
837 inst->mlen = 2;
838 inst->header_size = 1;
839 param_base = inst->base_mrf + 1;
840 } else {
841 inst->mlen = 1;
842 param_base = inst->base_mrf;
843 }
844
845 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
846 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
847 int zero_mask = 0xf & ~coord_mask;
848
849 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
850 coordinate));
851
852 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
853 brw_imm_d(0)));
854
855 emit(inst);
856 return src_reg(inst->dst);
857 }
858
859 bool
860 vec4_visitor::is_high_sampler(src_reg sampler)
861 {
862 if (devinfo->gen < 8 && !devinfo->is_haswell)
863 return false;
864
865 return sampler.file != IMM || sampler.ud >= 16;
866 }
867
868 void
869 vec4_visitor::emit_texture(ir_texture_opcode op,
870 dst_reg dest,
871 const glsl_type *dest_type,
872 src_reg coordinate,
873 int coord_components,
874 src_reg shadow_comparitor,
875 src_reg lod, src_reg lod2,
876 src_reg sample_index,
877 uint32_t constant_offset,
878 src_reg offset_value,
879 src_reg mcs,
880 bool is_cube_array,
881 uint32_t surface,
882 src_reg surface_reg,
883 uint32_t sampler,
884 src_reg sampler_reg)
885 {
886 /* The sampler can only meaningfully compute LOD for fragment shader
887 * messages. For all other stages, we change the opcode to TXL and hardcode
888 * the LOD to 0.
889 *
890 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
891 * valid LOD argument.
892 */
893 if (op == ir_tex || op == ir_query_levels) {
894 assert(lod.file == BAD_FILE);
895 lod = brw_imm_f(0.0f);
896 }
897
898 enum opcode opcode;
899 switch (op) {
900 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
901 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
902 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
903 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
904 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
905 SHADER_OPCODE_TXF_CMS); break;
906 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
907 case ir_tg4: opcode = offset_value.file != BAD_FILE
908 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
909 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
910 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
911 case ir_txb:
912 unreachable("TXB is not valid for vertex shaders.");
913 case ir_lod:
914 unreachable("LOD is not valid for vertex shaders.");
915 case ir_samples_identical: {
916 /* There are some challenges implementing this for vec4, and it seems
917       * unlikely to be used anyway. For now, just always return false.
918 */
919 emit(MOV(dest, brw_imm_ud(0u)));
920 return;
921 }
922 default:
923 unreachable("Unrecognized tex op");
924 }
925
926 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
927
928 inst->offset = constant_offset;
929
930 /* The message header is necessary for:
931 * - Gen4 (always)
932 * - Gen9+ for selecting SIMD4x2
933 * - Texel offsets
934 * - Gather channel selection
935 * - Sampler indices too large to fit in a 4-bit value.
936 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
937 */
938 inst->header_size =
939 (devinfo->gen < 5 || devinfo->gen >= 9 ||
940 inst->offset != 0 || op == ir_tg4 ||
941 op == ir_texture_samples ||
942 is_high_sampler(sampler_reg)) ? 1 : 0;
943 inst->base_mrf = 2;
944 inst->mlen = inst->header_size;
945 inst->dst.writemask = WRITEMASK_XYZW;
946 inst->shadow_compare = shadow_comparitor.file != BAD_FILE;
947
948 inst->src[1] = surface_reg;
949 inst->src[2] = sampler_reg;
950
951 /* MRF for the first parameter */
952 int param_base = inst->base_mrf + inst->header_size;
953
954 if (op == ir_txs || op == ir_query_levels) {
955 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
956 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
957 inst->mlen++;
958 } else if (op == ir_texture_samples) {
959 inst->dst.writemask = WRITEMASK_X;
960 } else {
961 /* Load the coordinate */
962 /* FINISHME: gl_clamp_mask and saturate */
963 int coord_mask = (1 << coord_components) - 1;
964 int zero_mask = 0xf & ~coord_mask;
965
966 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
967 coordinate));
968 inst->mlen++;
969
970 if (zero_mask != 0) {
971 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
972 brw_imm_d(0)));
973 }
974 /* Load the shadow comparitor */
975 if (shadow_comparitor.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
976 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparitor.type,
977 WRITEMASK_X),
978 shadow_comparitor));
979 inst->mlen++;
980 }
981
982 /* Load the LOD info */
983 if (op == ir_tex || op == ir_txl) {
984 int mrf, writemask;
985 if (devinfo->gen >= 5) {
986 mrf = param_base + 1;
987 if (shadow_comparitor.file != BAD_FILE) {
988 writemask = WRITEMASK_Y;
989 /* mlen already incremented */
990 } else {
991 writemask = WRITEMASK_X;
992 inst->mlen++;
993 }
994 } else /* devinfo->gen == 4 */ {
995 mrf = param_base;
996 writemask = WRITEMASK_W;
997 }
998 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
999 } else if (op == ir_txf) {
1000 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1001 } else if (op == ir_txf_ms) {
1002 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1003 sample_index));
1004 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1005             /* MCS data is stored in the first two channels of `mcs`, but we
1006 * need to get it into the .y and .z channels of the second vec4
1007 * of params.
1008 */
1009 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1010 emit(MOV(dst_reg(MRF, param_base + 1,
1011 glsl_type::uint_type, WRITEMASK_YZ),
1012 mcs));
1013 } else if (devinfo->gen >= 7) {
1014 /* MCS data is in the first channel of `mcs`, but we need to get it into
1015 * the .y channel of the second vec4 of params, so replicate .x across
1016 * the whole vec4 and then mask off everything except .y
1017 */
1018 mcs.swizzle = BRW_SWIZZLE_XXXX;
1019 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1020 mcs));
1021 }
1022 inst->mlen++;
1023 } else if (op == ir_txd) {
1024 const brw_reg_type type = lod.type;
1025
1026 if (devinfo->gen >= 5) {
1027 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1028 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1029 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1030 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1031 inst->mlen++;
1032
1033 if (dest_type->vector_elements == 3 || shadow_comparitor.file != BAD_FILE) {
1034 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1035 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1036 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1037 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1038 inst->mlen++;
1039
1040 if (shadow_comparitor.file != BAD_FILE) {
1041 emit(MOV(dst_reg(MRF, param_base + 2,
1042 shadow_comparitor.type, WRITEMASK_Z),
1043 shadow_comparitor));
1044 }
1045 }
1046 } else /* devinfo->gen == 4 */ {
1047 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1048 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1049 inst->mlen += 2;
1050 }
1051 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1052 if (shadow_comparitor.file != BAD_FILE) {
1053 emit(MOV(dst_reg(MRF, param_base, shadow_comparitor.type, WRITEMASK_W),
1054 shadow_comparitor));
1055 }
1056
1057 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1058 offset_value));
1059 inst->mlen++;
1060 }
1061 }
1062
1063 emit(inst);
1064
1065 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1066 * spec requires layers.
1067 */
1068 if (op == ir_txs && is_cube_array) {
1069 emit_math(SHADER_OPCODE_INT_QUOTIENT,
1070 writemask(inst->dst, WRITEMASK_Z),
1071 src_reg(inst->dst), brw_imm_d(6));
1072 }
1073
1074 if (devinfo->gen == 6 && op == ir_tg4) {
1075 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1076 }
1077
1078 if (op == ir_query_levels) {
1079 /* # levels is in .w */
1080 src_reg swizzled(dest);
1081 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1082 SWIZZLE_W, SWIZZLE_W);
1083 emit(MOV(dest, swizzled));
1084 }
1085 }
1086
1087 /**
1088 * Apply workarounds for Gen6 gather with UINT/SINT
1089 */
1090 void
1091 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1092 {
1093 if (!wa)
1094 return;
1095
1096 int width = (wa & WA_8BIT) ? 8 : 16;
1097 dst_reg dst_f = dst;
1098 dst_f.type = BRW_REGISTER_TYPE_F;
1099
1100 /* Convert from UNORM to UINT */
1101 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1102 emit(MOV(dst, src_reg(dst_f)));
1103
1104 if (wa & WA_SIGN) {
1105 /* Reinterpret the UINT value as a signed INT value by
1106 * shifting the sign bit into place, then shifting back
1107 * preserving sign.
1108 */
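      /* For example, with width == 8, a texel returned as UNORM 1.0 becomes
       * 255 after the MUL/MOV above; SHL by 24 gives 0xff000000 and the
       * arithmetic shift right by 24 yields -1, the expected SINT value.
       */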
1109 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1110 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1111 }
1112 }
1113
1114 void
1115 vec4_visitor::gs_emit_vertex(int stream_id)
1116 {
1117 unreachable("not reached");
1118 }
1119
1120 void
1121 vec4_visitor::gs_end_primitive()
1122 {
1123 unreachable("not reached");
1124 }
1125
1126 void
1127 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
1128 dst_reg dst, src_reg surf_offset,
1129 src_reg src0, src_reg src1)
1130 {
1131 unsigned mlen = 1 + (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
1132 src_reg src_payload(this, glsl_type::uint_type, mlen);
1133 dst_reg payload(src_payload);
1134 payload.writemask = WRITEMASK_X;
1135
1136 /* Set the atomic operation offset. */
1137 emit(MOV(offset(payload, 0), surf_offset));
1138 unsigned i = 1;
1139
1140 /* Set the atomic operation arguments. */
1141 if (src0.file != BAD_FILE) {
1142 emit(MOV(offset(payload, i), src0));
1143 i++;
1144 }
1145
1146 if (src1.file != BAD_FILE) {
1147 emit(MOV(offset(payload, i), src1));
1148 i++;
1149 }
1150
1151 /* Emit the instruction. Note that this maps to the normal SIMD8
1152 * untyped atomic message on Ivy Bridge, but that's OK because
1153 * unused channels will be masked out.
1154 */
1155 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
1156 src_payload,
1157 brw_imm_ud(surf_index), brw_imm_ud(atomic_op));
1158 inst->mlen = mlen;
1159 }
1160
1161 void
1162 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
1163 src_reg surf_offset)
1164 {
1165 dst_reg offset(this, glsl_type::uint_type);
1166 offset.writemask = WRITEMASK_X;
1167
1168 /* Set the surface read offset. */
1169 emit(MOV(offset, surf_offset));
1170
1171 /* Emit the instruction. Note that this maps to the normal SIMD8
1172 * untyped surface read message, but that's OK because unused
1173 * channels will be masked out.
1174 */
1175 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst,
1176 src_reg(offset),
1177 brw_imm_ud(surf_index), brw_imm_d(1));
1178 inst->mlen = 1;
1179 }
1180
1181 void
1182 vec4_visitor::emit_ndc_computation()
1183 {
1184 if (output_reg[VARYING_SLOT_POS].file == BAD_FILE)
1185 return;
1186
1187 /* Get the position */
1188 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
1189
1190 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1191 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1192 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
1193
1194 current_annotation = "NDC";
1195 dst_reg ndc_w = ndc;
1196 ndc_w.writemask = WRITEMASK_W;
1197 src_reg pos_w = pos;
1198 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1199 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1200
1201 dst_reg ndc_xyz = ndc;
1202 ndc_xyz.writemask = WRITEMASK_XYZ;
1203
1204 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1205 }
1206
1207 void
1208 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1209 {
1210 if (devinfo->gen < 6 &&
1211 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1212 output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE ||
1213 devinfo->has_negative_rhw_bug)) {
1214 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1215 dst_reg header1_w = header1;
1216 header1_w.writemask = WRITEMASK_W;
1217
1218 emit(MOV(header1, brw_imm_ud(0u)));
1219
1220 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1221 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1222
1223 current_annotation = "Point size";
1224 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1225 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1226 }
1227
1228 if (output_reg[VARYING_SLOT_CLIP_DIST0].file != BAD_FILE) {
1229 current_annotation = "Clipping flags";
1230 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1231 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1232
1233 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1234 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1235 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1236
1237 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1238 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1239 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1240 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1241 }
1242
1243 /* i965 clipping workaround:
1244        * 1) Test for negative rhw
1245 * 2) If set,
1246 * set ndc = (0,0,0,0)
1247 * set ucp[6] = 1
1248 *
1249 * Later, clipping will detect ucp[6] and ensure the primitive is
1250 * clipped against all fixed planes.
1251 */
1252 if (devinfo->has_negative_rhw_bug &&
1253 output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE) {
1254 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
1255 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1256 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1257 vec4_instruction *inst;
1258 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1259 inst->predicate = BRW_PREDICATE_NORMAL;
1260 output_reg[BRW_VARYING_SLOT_NDC].type = BRW_REGISTER_TYPE_F;
1261 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], brw_imm_f(0.0f)));
1262 inst->predicate = BRW_PREDICATE_NORMAL;
1263 }
1264
1265 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1266 } else if (devinfo->gen < 6) {
1267 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1268 } else {
1269 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1270 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1271 dst_reg reg_w = reg;
1272 reg_w.writemask = WRITEMASK_W;
1273 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ]);
1274 reg_as_src.type = reg_w.type;
1275 reg_as_src.swizzle = brw_swizzle_for_size(1);
1276 emit(MOV(reg_w, reg_as_src));
1277 }
1278 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1279 dst_reg reg_y = reg;
1280 reg_y.writemask = WRITEMASK_Y;
1281 reg_y.type = BRW_REGISTER_TYPE_D;
1282 output_reg[VARYING_SLOT_LAYER].type = reg_y.type;
1283 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
1284 }
1285 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1286 dst_reg reg_z = reg;
1287 reg_z.writemask = WRITEMASK_Z;
1288 reg_z.type = BRW_REGISTER_TYPE_D;
1289 output_reg[VARYING_SLOT_VIEWPORT].type = reg_z.type;
1290 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
1291 }
1292 }
1293 }
1294
1295 vec4_instruction *
1296 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
1297 {
1298 assert(varying < VARYING_SLOT_MAX);
1299 assert(output_reg[varying].type == reg.type);
1300 current_annotation = output_reg_annotation[varying];
1301 if (output_reg[varying].file != BAD_FILE)
1302 return emit(MOV(reg, src_reg(output_reg[varying])));
1303 else
1304 return NULL;
1305 }
1306
1307 void
1308 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1309 {
1310 reg.type = BRW_REGISTER_TYPE_F;
1311 output_reg[varying].type = reg.type;
1312
1313 switch (varying) {
1314 case VARYING_SLOT_PSIZ:
1315 {
1316 /* PSIZ is always in slot 0, and is coupled with other flags. */
1317 current_annotation = "indices, point width, clip flags";
1318 emit_psiz_and_flags(reg);
1319 break;
1320 }
1321 case BRW_VARYING_SLOT_NDC:
1322 current_annotation = "NDC";
1323 if (output_reg[BRW_VARYING_SLOT_NDC].file != BAD_FILE)
1324 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
1325 break;
1326 case VARYING_SLOT_POS:
1327 current_annotation = "gl_Position";
1328 if (output_reg[VARYING_SLOT_POS].file != BAD_FILE)
1329 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
1330 break;
1331 case VARYING_SLOT_EDGE:
1332 /* This is present when doing unfilled polygons. We're supposed to copy
1333 * the edge flag from the user-provided vertex array
1334 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
1335 * of that attribute (starts as 1.0f). This is then used in clipping to
1336 * determine which edges should be drawn as wireframe.
1337 */
1338 current_annotation = "edge flag";
1339 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1340 glsl_type::float_type, WRITEMASK_XYZW))));
1341 break;
1342 case BRW_VARYING_SLOT_PAD:
1343 /* No need to write to this slot */
1344 break;
1345 default:
1346 emit_generic_urb_slot(reg, varying);
1347 break;
1348 }
1349 }
1350
1351 static int
1352 align_interleaved_urb_mlen(const struct brw_device_info *devinfo, int mlen)
1353 {
1354 if (devinfo->gen >= 6) {
1355 /* URB data written (does not include the message header reg) must
1356 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1357 * section 5.4.3.2.2: URB_INTERLEAVED.
1358 *
1359 * URB entries are allocated on a multiple of 1024 bits, so an
1360 * extra 128 bits written here to make the end align to 256 is
1361 * no problem.
1362 */
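      /* For example, one header register plus five slot registers gives
       * mlen == 6, i.e. five data registers (odd); bumping mlen to 7 pads the
       * written data to six registers, a multiple of 256 bits.
       */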
1363 if ((mlen % 2) != 1)
1364 mlen++;
1365 }
1366
1367 return mlen;
1368 }
1369
1370
1371 /**
1372 * Generates the VUE payload plus the necessary URB write instructions to
1373 * output it.
1374 *
1375 * The VUE layout is documented in Volume 2a.
1376 */
1377 void
1378 vec4_visitor::emit_vertex()
1379 {
1380 /* MRF 0 is reserved for the debugger, so start with message header
1381 * in MRF 1.
1382 */
1383 int base_mrf = 1;
1384 int mrf = base_mrf;
1385 /* In the process of generating our URB write message contents, we
1386 * may need to unspill a register or load from an array. Those
1387 * reads would use MRFs 14-15.
1388 */
1389 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1390
1391 /* The following assertion verifies that max_usable_mrf causes an
1392 * even-numbered amount of URB write data, which will meet gen6's
1393 * requirements for length alignment.
1394 */
1395 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1396
1397 /* First mrf is the g0-based message header containing URB handles and
1398 * such.
1399 */
1400 emit_urb_write_header(mrf++);
1401
1402 if (devinfo->gen < 6) {
1403 emit_ndc_computation();
1404 }
1405
1406 /* We may need to split this up into several URB writes, so do them in a
1407 * loop.
1408 */
1409 int slot = 0;
1410 bool complete = false;
1411 do {
1412 /* URB offset is in URB row increments, and each of our MRFs is half of
1413 * one of those, since we're doing interleaved writes.
1414 */
1415 int offset = slot / 2;
1416
1417 mrf = base_mrf + 1;
1418 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1419 emit_urb_slot(dst_reg(MRF, mrf++),
1420 prog_data->vue_map.slot_to_varying[slot]);
1421
1422 /* If this was max_usable_mrf, we can't fit anything more into this
1423 * URB WRITE. Same thing if we reached the maximum length available.
1424 */
1425 if (mrf > max_usable_mrf ||
1426 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1427 slot++;
1428 break;
1429 }
1430 }
1431
1432 complete = slot >= prog_data->vue_map.num_slots;
1433 current_annotation = "URB write";
1434 vec4_instruction *inst = emit_urb_write_opcode(complete);
1435 inst->base_mrf = base_mrf;
1436 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1437 inst->offset += offset;
1438 } while(!complete);
1439 }
1440
1441
1442 src_reg
1443 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1444 src_reg *reladdr, int reg_offset)
1445 {
1446 /* Because we store the values to scratch interleaved like our
1447 * vertex data, we need to scale the vec4 index by 2.
1448 */
1449 int message_header_scale = 2;
1450
1451 /* Pre-gen6, the message header uses byte offsets instead of vec4
1452 * (16-byte) offset units.
1453 */
1454 if (devinfo->gen < 6)
1455 message_header_scale *= 16;
1456
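   /* For example, reg_offset == 3 becomes offset 6 in the message's units on
    * Gen6+, or 96 bytes (3 * 2 * 16) on earlier generations.
    */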
1457 if (reladdr) {
1458 src_reg index = src_reg(this, glsl_type::int_type);
1459
1460 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1461 brw_imm_d(reg_offset)));
1462 emit_before(block, inst, MUL(dst_reg(index), index,
1463 brw_imm_d(message_header_scale)));
1464
1465 return index;
1466 } else {
1467 return brw_imm_d(reg_offset * message_header_scale);
1468 }
1469 }
1470
1471 src_reg
1472 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
1473 src_reg *reladdr, int reg_offset)
1474 {
1475 if (reladdr) {
1476 src_reg index = src_reg(this, glsl_type::int_type);
1477
1478 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1479 brw_imm_d(reg_offset * 16)));
1480
1481 return index;
1482 } else if (devinfo->gen >= 8) {
1483 /* Store the offset in a GRF so we can send-from-GRF. */
1484 src_reg offset = src_reg(this, glsl_type::int_type);
1485 emit_before(block, inst, MOV(dst_reg(offset), brw_imm_d(reg_offset * 16)));
1486 return offset;
1487 } else {
1488 return brw_imm_d(reg_offset * 16);
1489 }
1490 }
1491
1492 /**
1493 * Emits an instruction before @inst to load the value named by @orig_src
1494 * from scratch space at @base_offset to @temp.
1495 *
1496 * @base_offset is measured in 32-byte units (the size of a register).
1497 */
1498 void
1499 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1500 dst_reg temp, src_reg orig_src,
1501 int base_offset)
1502 {
1503 int reg_offset = base_offset + orig_src.reg_offset;
1504 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1505 reg_offset);
1506
1507 emit_before(block, inst, SCRATCH_READ(temp, index));
1508 }
1509
1510 /**
1511 * Emits an instruction after @inst to store the value to be written
1512 * to @orig_dst to scratch space at @base_offset, from @temp.
1513 *
1514 * @base_offset is measured in 32-byte units (the size of a register).
1515 */
1516 void
1517 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1518 int base_offset)
1519 {
1520 int reg_offset = base_offset + inst->dst.reg_offset;
1521 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1522 reg_offset);
1523
1524 /* Create a temporary register to store *inst's result in.
1525 *
1526 * We have to be careful in MOVing from our temporary result register in
1527 * the scratch write. If we swizzle from channels of the temporary that
1528 * weren't initialized, it will confuse live interval analysis, which will
1529 * make spilling fail to make progress.
1530 */
1531 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1532 inst->dst.type),
1533 brw_swizzle_for_mask(inst->dst.writemask));
1534 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1535 inst->dst.writemask));
1536 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1537 if (inst->opcode != BRW_OPCODE_SEL)
1538 write->predicate = inst->predicate;
1539 write->ir = inst->ir;
1540 write->annotation = inst->annotation;
1541 inst->insert_after(block, write);
1542
1543 inst->dst.file = temp.file;
1544 inst->dst.nr = temp.nr;
1545 inst->dst.reg_offset = temp.reg_offset;
1546 inst->dst.reladdr = NULL;
1547 }
1548
1549 /**
1550 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1551 * adds the scratch read(s) before \p inst. The function also checks for
1552 * recursive reladdr scratch accesses, issuing the corresponding scratch
1553 * loads and rewriting reladdr references accordingly.
1554 *
1555 * \return \p src if it did not require a scratch load, otherwise, the
1556 * register holding the result of the scratch load that the caller should
1557 * use to rewrite src.
1558 */
1559 src_reg
1560 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1561 vec4_instruction *inst, src_reg src)
1562 {
1563 /* Resolve recursive reladdr scratch access by calling ourselves
1564 * with src.reladdr
1565 */
1566 if (src.reladdr)
1567 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1568 *src.reladdr);
1569
1570 /* Now handle scratch access on src */
1571 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1572 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1573 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1574 src.nr = temp.nr;
1575 src.reg_offset = temp.reg_offset;
1576 src.reladdr = NULL;
1577 }
1578
1579 return src;
1580 }
1581
1582 /**
1583 * We can't generally support array access in GRF space, because a
1584 * single instruction's destination can only span 2 contiguous
1585 * registers. So, we send all GRF arrays that get variable index
1586 * access to scratch space.
1587 */
1588 void
1589 vec4_visitor::move_grf_array_access_to_scratch()
1590 {
1591 int scratch_loc[this->alloc.count];
1592 memset(scratch_loc, -1, sizeof(scratch_loc));
1593
1594 /* First, calculate the set of virtual GRFs that need to be punted
1595 * to scratch due to having any array access on them, and where in
1596 * scratch.
1597 */
1598 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1599 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1600 if (scratch_loc[inst->dst.nr] == -1) {
1601 scratch_loc[inst->dst.nr] = last_scratch;
1602 last_scratch += this->alloc.sizes[inst->dst.nr];
1603 }
1604
1605 for (src_reg *iter = inst->dst.reladdr;
1606 iter->reladdr;
1607 iter = iter->reladdr) {
1608 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1609 scratch_loc[iter->nr] = last_scratch;
1610 last_scratch += this->alloc.sizes[iter->nr];
1611 }
1612 }
1613 }
1614
1615 for (int i = 0 ; i < 3; i++) {
1616 for (src_reg *iter = &inst->src[i];
1617 iter->reladdr;
1618 iter = iter->reladdr) {
1619 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1620 scratch_loc[iter->nr] = last_scratch;
1621 last_scratch += this->alloc.sizes[iter->nr];
1622 }
1623 }
1624 }
1625 }
1626
1627 /* Now, for anything that will be accessed through scratch, rewrite
1628 * it to load/store. Note that this is a _safe list walk, because
1629 * we may generate a new scratch_write instruction after the one
1630 * we're processing.
1631 */
1632 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1633 /* Set up the annotation tracking for new generated instructions. */
1634 base_ir = inst->ir;
1635 current_annotation = inst->annotation;
1636
1637 /* First handle scratch access on the dst. Notice we have to handle
1638 * the case where the dst's reladdr also points to scratch space.
1639 */
1640 if (inst->dst.reladdr)
1641 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1642 *inst->dst.reladdr);
1643
1644 /* Now that we have handled any (possibly recursive) reladdr scratch
1645 * accesses for dst we can safely do the scratch write for dst itself
1646 */
1647 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1648 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1649
1650 /* Now handle scratch access on any src. In this case, since inst->src[i]
1651 * already is a src_reg, we can just call emit_resolve_reladdr with
1652 * inst->src[i] and it will take care of handling scratch loads for
1653 * both src and src.reladdr (recursively).
1654 */
1655 for (int i = 0 ; i < 3; i++) {
1656 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1657 inst->src[i]);
1658 }
1659 }
1660 }
1661
1662 /**
1663 * Emits an instruction before @inst to load the value named by @orig_src
1664 * from the pull constant buffer (surface) at @base_offset to @temp.
1665 */
1666 void
1667 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1668 dst_reg temp, src_reg orig_src,
1669 int base_offset)
1670 {
1671 int reg_offset = base_offset + orig_src.reg_offset;
1672 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1673 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
1674 reg_offset);
1675
1676 emit_pull_constant_load_reg(temp,
1677 brw_imm_ud(index),
1678 offset,
1679 block, inst);
1680
1681 brw_mark_surface_used(&prog_data->base, index);
1682 }
1683
1684 /**
1685 * Implements array access of uniforms by inserting a
1686 * PULL_CONSTANT_LOAD instruction.
1687 *
1688 * Unlike temporary GRF array access (where we don't support it due to
1689 * the difficulty of doing relative addressing on instruction
1690 * destinations), we could potentially do array access of uniforms
1691 * that were loaded in GRF space as push constants. In real-world
1692 * usage we've seen, though, the arrays being used are always larger
1693 * than we could load as push constants, so just always move all
1694 * uniform array access out to a pull constant buffer.
1695 */
1696 void
1697 vec4_visitor::move_uniform_array_access_to_pull_constants()
1698 {
1699 int pull_constant_loc[this->uniforms];
1700 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1701 bool nested_reladdr;
1702
1703 /* Walk through and find array access of uniforms. Put a copy of that
1704 * uniform in the pull constant buffer.
1705 *
1706 * Note that we don't move constant-indexed accesses to arrays. No
1707 * testing has been done of the performance impact of this choice.
1708 */
1709 do {
1710 nested_reladdr = false;
1711
1712 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1713 for (int i = 0 ; i < 3; i++) {
1714 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1715 continue;
1716
1717 int uniform = inst->src[i].nr;
1718
1719 if (inst->src[i].reladdr->reladdr)
1720 nested_reladdr = true; /* will need another pass */
1721
1722 /* If this array isn't already present in the pull constant buffer,
1723 * add it.
1724 */
1725 if (pull_constant_loc[uniform] == -1) {
1726 const gl_constant_value **values =
1727 &stage_prog_data->param[uniform * 4];
1728
1729 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
1730
1731 assert(uniform < uniform_array_size);
1732 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
1733 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1734 = values[j];
1735 }
1736 }
1737
1738 /* Set up the annotation tracking for new generated instructions. */
1739 base_ir = inst->ir;
1740 current_annotation = inst->annotation;
1741
1742 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1743
1744 emit_pull_constant_load(block, inst, temp, inst->src[i],
1745 pull_constant_loc[uniform]);
1746
1747 inst->src[i].file = temp.file;
1748 inst->src[i].nr = temp.nr;
1749 inst->src[i].reg_offset = temp.reg_offset;
1750 inst->src[i].reladdr = NULL;
1751 }
1752 }
1753 } while (nested_reladdr);
1754
1755 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1756 * no need to track them as larger-than-vec4 objects. This will be
1757 * relied on in cutting out unused uniform vectors from push
1758 * constants.
1759 */
1760 split_uniform_registers();
1761 }
1762
1763 void
1764 vec4_visitor::resolve_ud_negate(src_reg *reg)
1765 {
1766 if (reg->type != BRW_REGISTER_TYPE_UD ||
1767 !reg->negate)
1768 return;
1769
1770 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1771 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1772 *reg = temp;
1773 }
1774
1775 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1776 void *log_data,
1777 const struct brw_sampler_prog_key_data *key_tex,
1778 struct brw_vue_prog_data *prog_data,
1779 const nir_shader *shader,
1780 void *mem_ctx,
1781 bool no_spills,
1782 int shader_time_index)
1783 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1784 key_tex(key_tex),
1785 prog_data(prog_data),
1786 fail_msg(NULL),
1787 first_non_payload_grf(0),
1788 need_all_constants_in_pull_buffer(false),
1789 no_spills(no_spills),
1790 shader_time_index(shader_time_index),
1791 last_scratch(0)
1792 {
1793 this->failed = false;
1794
1795 this->base_ir = NULL;
1796 this->current_annotation = NULL;
1797 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1798
1799 this->virtual_grf_start = NULL;
1800 this->virtual_grf_end = NULL;
1801 this->live_intervals = NULL;
1802
1803 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1804
1805 this->uniforms = 0;
1806
1807 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
1808 * at least one. See setup_uniforms() in brw_vec4.cpp.
1809 */
1810 this->uniform_array_size = 1;
1811 if (prog_data) {
1812 this->uniform_array_size =
1813 MAX2(DIV_ROUND_UP(stage_prog_data->nr_params, 4), 1);
1814 }
1815
1816 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
1817 }
1818
1819 vec4_visitor::~vec4_visitor()
1820 {
1821 }
1822
1823
1824 void
1825 vec4_visitor::fail(const char *format, ...)
1826 {
1827 va_list va;
1828 char *msg;
1829
1830 if (failed)
1831 return;
1832
1833 failed = true;
1834
1835 va_start(va, format);
1836 msg = ralloc_vasprintf(mem_ctx, format, va);
1837 va_end(va);
1838 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1839
1840 this->fail_msg = msg;
1841
1842 if (debug_enabled) {
1843 fprintf(stderr, "%s", msg);
1844 }
1845 }
1846
1847 } /* namespace brw */