i965/vec4: fix size_written for doubles
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "brw_eu.h"
27 #include "brw_program.h"
28
29 namespace brw {
30
31 vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
32 const src_reg &src0, const src_reg &src1,
33 const src_reg &src2)
34 {
35 this->opcode = opcode;
36 this->dst = dst;
37 this->src[0] = src0;
38 this->src[1] = src1;
39 this->src[2] = src2;
40 this->saturate = false;
41 this->force_writemask_all = false;
42 this->no_dd_clear = false;
43 this->no_dd_check = false;
44 this->writes_accumulator = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->predicate = BRW_PREDICATE_NONE;
47 this->predicate_inverse = false;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = NULL;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_size = 0;
53 this->flag_subreg = 0;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->exec_size = 8;
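/* size_written is the number of bytes this instruction writes to its
 * destination: exec_size lanes of the destination type. A double-precision
 * destination therefore counts for twice as much as a single-precision one.
 */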
58 this->size_written = (dst.file == BAD_FILE ?
59 0 : this->exec_size * type_sz(dst.type));
60 this->annotation = NULL;
61 }
62
63 vec4_instruction *
64 vec4_visitor::emit(vec4_instruction *inst)
65 {
66 inst->ir = this->base_ir;
67 inst->annotation = this->current_annotation;
68
69 this->instructions.push_tail(inst);
70
71 return inst;
72 }
73
74 vec4_instruction *
75 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
76 vec4_instruction *new_inst)
77 {
78 new_inst->ir = inst->ir;
79 new_inst->annotation = inst->annotation;
80
81 inst->insert_before(block, new_inst);
82
83 return inst;
84 }
85
86 vec4_instruction *
87 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
88 const src_reg &src1, const src_reg &src2)
89 {
90 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
91 }
92
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
96 const src_reg &src1)
97 {
98 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
103 {
104 return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
109 {
110 return emit(new(mem_ctx) vec4_instruction(opcode, dst));
111 }
112
113 vec4_instruction *
114 vec4_visitor::emit(enum opcode opcode)
115 {
116 return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg()));
117 }
118
119 #define ALU1(op) \
120 vec4_instruction * \
121 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
122 { \
123 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
124 }
125
126 #define ALU2(op) \
127 vec4_instruction * \
128 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
129 const src_reg &src1) \
130 { \
131 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
132 src0, src1); \
133 }
134
135 #define ALU2_ACC(op) \
136 vec4_instruction * \
137 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
138 const src_reg &src1) \
139 { \
140 vec4_instruction *inst = new(mem_ctx) vec4_instruction( \
141 BRW_OPCODE_##op, dst, src0, src1); \
142 inst->writes_accumulator = true; \
143 return inst; \
144 }
145
146 #define ALU3(op) \
147 vec4_instruction * \
148 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
149 const src_reg &src1, const src_reg &src2) \
150 { \
151 assert(devinfo->gen >= 6); \
152 return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, \
153 src0, src1, src2); \
154 }
155
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU1(F32TO16)
163 ALU1(F16TO32)
164 ALU2(ADD)
165 ALU2(MUL)
166 ALU2_ACC(MACH)
167 ALU2(AND)
168 ALU2(OR)
169 ALU2(XOR)
170 ALU2(DP3)
171 ALU2(DP4)
172 ALU2(DPH)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(MAC)
188 ALU1(DIM)
189
190 /** Gen4 predicated IF. */
191 vec4_instruction *
192 vec4_visitor::IF(enum brw_predicate predicate)
193 {
194 vec4_instruction *inst;
195
196 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
197 inst->predicate = predicate;
198
199 return inst;
200 }
201
202 /** Gen6 IF with embedded comparison. */
203 vec4_instruction *
204 vec4_visitor::IF(src_reg src0, src_reg src1,
205 enum brw_conditional_mod condition)
206 {
207 assert(devinfo->gen == 6);
208
209 vec4_instruction *inst;
210
211 resolve_ud_negate(&src0);
212 resolve_ud_negate(&src1);
213
214 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_IF, dst_null_d(),
215 src0, src1);
216 inst->conditional_mod = condition;
217
218 return inst;
219 }
220
221 /**
222 * CMP: Sets the low bit of the destination channels with the result
223 * of the comparison, while the upper bits are undefined, and updates
224 * the flag register with the packed 16 bits of the result.
225 */
226 vec4_instruction *
227 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
228 enum brw_conditional_mod condition)
229 {
230 vec4_instruction *inst;
231
232 /* Take the instruction:
233 *
234 * CMP null<d> src0<f> src1<f>
235 *
236 * Original gen4 does type conversion to the destination type before
237 * comparison, producing garbage results for floating point comparisons.
238 *
239 * The destination type doesn't matter on newer generations, so we set the
240 * type to match src0 so we can compact the instruction.
241 */
242 dst.type = src0.type;
243
244 resolve_ud_negate(&src0);
245 resolve_ud_negate(&src1);
246
247 inst = new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
248 inst->conditional_mod = condition;
249
250 return inst;
251 }
252
253 vec4_instruction *
254 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
255 {
256 vec4_instruction *inst;
257
258 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ,
259 dst, index);
260 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
261 inst->mlen = 2;
262
263 return inst;
264 }
265
266 vec4_instruction *
267 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
268 const src_reg &index)
269 {
270 vec4_instruction *inst;
271
272 inst = new(mem_ctx) vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
273 dst, src, index);
274 inst->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
275 inst->mlen = 3;
276
277 return inst;
278 }
279
280 src_reg
281 vec4_visitor::fix_3src_operand(const src_reg &src)
282 {
283 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
284 * able to use vertical stride of zero to replicate the vec4 uniform, like
285 *
286 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
287 *
288 * But you can't, since vertical stride is always four in three-source
289 * instructions. Instead, insert a MOV instruction to do the replication so
290 * that the three-source instruction can consume it.
291 */
292
293 /* The MOV is only needed if the source is a uniform or immediate. */
294 if (src.file != UNIFORM && src.file != IMM)
295 return src;
296
297 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
298 return src;
299
300 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
301 expanded.type = src.type;
302 emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
303 return src_reg(expanded);
304 }
305
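/**
 * If \p src has the abs or negate modifier set, resolve the modifier into a
 * temporary with a MOV and return the temporary, so the value can be used
 * where source modifiers are not acceptable.
 */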
306 src_reg
307 vec4_visitor::resolve_source_modifiers(const src_reg &src)
308 {
309 if (!src.abs && !src.negate)
310 return src;
311
312 dst_reg resolved = dst_reg(this, glsl_type::ivec4_type);
313 resolved.type = src.type;
314 emit(MOV(resolved, src));
315
316 return src_reg(resolved);
317 }
318
319 src_reg
320 vec4_visitor::fix_math_operand(const src_reg &src)
321 {
322 if (devinfo->gen < 6 || devinfo->gen >= 8 || src.file == BAD_FILE)
323 return src;
324
325 /* The gen6 math instruction ignores the source modifiers --
326 * swizzle, abs, negate, and at least some parts of the register
327 * region description.
328 *
329 * Rather than trying to enumerate all these cases, *always* expand the
330 * operand to a temp GRF for gen6.
331 *
332 * For gen7, keep the operand as-is, except if immediate, which gen7 still
333 * can't use.
334 */
335
336 if (devinfo->gen == 7 && src.file != IMM)
337 return src;
338
339 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
340 expanded.type = src.type;
341 emit(MOV(expanded, src));
342 return src_reg(expanded);
343 }
344
345 vec4_instruction *
346 vec4_visitor::emit_math(enum opcode opcode,
347 const dst_reg &dst,
348 const src_reg &src0, const src_reg &src1)
349 {
350 vec4_instruction *math =
351 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
352
353 if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
354 /* MATH on Gen6 must be align1, so we can't do writemasks. */
355 math->dst = dst_reg(this, glsl_type::vec4_type);
356 math->dst.type = dst.type;
357 math = emit(MOV(dst, src_reg(math->dst)));
358 } else if (devinfo->gen < 6) {
359 math->base_mrf = 1;
360 math->mlen = src1.file == BAD_FILE ? 1 : 2;
361 }
362
363 return math;
364 }
365
366 void
367 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
368 {
369 if (devinfo->gen < 7) {
370 unreachable("ir_unop_pack_half_2x16 should be lowered");
371 }
372
373 assert(dst.type == BRW_REGISTER_TYPE_UD);
374 assert(src0.type == BRW_REGISTER_TYPE_F);
375
376 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
377 *
378 * Because this instruction does not have a 16-bit floating-point type,
379 * the destination data type must be Word (W).
380 *
381 * The destination must be DWord-aligned and specify a horizontal stride
382 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
383 * each destination channel and the upper word is not modified.
384 *
385 * The above restriction implies that the f32to16 instruction must use
386 * align1 mode, because only in align1 mode is it possible to specify
387 * horizontal stride. We choose here to defy the hardware docs and emit
388 * align16 instructions.
389 *
390 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
391 * instructions. I was partially successful in that the code passed all
392 * tests. However, the code was dubiously correct and fragile, and the
393 * tests were not harsh enough to probe that frailty. Not trusting the
394 * code, I chose instead to remain in align16 mode in defiance of the hw
395 * docs).
396 *
397 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
398 * simulator, emitting a f32to16 in align16 mode with UD as destination
399 * data type is safe. The behavior differs from that specified in the PRM
400 * in that the upper word of each destination channel is cleared to 0.
401 */
402
403 dst_reg tmp_dst(this, glsl_type::uvec2_type);
404 src_reg tmp_src(tmp_dst);
405
406 #if 0
407 /* Verify the undocumented behavior on which the following instructions
408 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
409 * then the result of the bit-or instruction below will be incorrect.
410 *
411 * You should inspect the disasm output in order to verify that the MOV is
412 * not optimized away.
413 */
414 emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
415 #endif
416
417 /* Give tmp the form below, where "." means untouched.
418 *
419 * w z y x w z y x
420 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
421 *
422 * That the upper word of each write-channel be 0 is required for the
423 * following bit-shift and bit-or instructions to work. Note that this
424 * relies on the undocumented hardware behavior mentioned above.
425 */
426 tmp_dst.writemask = WRITEMASK_XY;
427 emit(F32TO16(tmp_dst, src0));
428
429 /* Give the write-channels of dst the form:
430 * 0xhhhh0000
431 */
432 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
433 emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
434
435 /* Finally, give the write-channels of dst the form of packHalf2x16's
436 * output:
437 * 0xhhhhllll
438 */
439 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
440 emit(OR(dst, src_reg(dst), tmp_src));
441 }
442
443 void
444 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
445 {
446 if (devinfo->gen < 7) {
447 unreachable("ir_unop_unpack_half_2x16 should be lowered");
448 }
449
450 assert(dst.type == BRW_REGISTER_TYPE_F);
451 assert(src0.type == BRW_REGISTER_TYPE_UD);
452
453 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
454 *
455 * Because this instruction does not have a 16-bit floating-point type,
456 * the source data type must be Word (W). The destination type must be
457 * F (Float).
458 *
459 * To use W as the source data type, we must adjust horizontal strides,
460 * which is only possible in align1 mode. All my [chadv] attempts at
461 * emitting align1 instructions for unpackHalf2x16 failed to pass the
462 * Piglit tests, so I gave up.
463 *
464 * I've verified that, on gen7 hardware and the simulator, it is safe to
465 * emit f16to32 in align16 mode with UD as source data type.
466 */
467
468 dst_reg tmp_dst(this, glsl_type::uvec2_type);
469 src_reg tmp_src(tmp_dst);
470
471 tmp_dst.writemask = WRITEMASK_X;
472 emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu)));
473
474 tmp_dst.writemask = WRITEMASK_Y;
475 emit(SHR(tmp_dst, src0, brw_imm_ud(16u)));
476
477 dst.writemask = WRITEMASK_XY;
478 emit(F16TO32(dst, tmp_src));
479 }
480
481 void
482 vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
483 {
484 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
485 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
486 * is not suitable to generate the shift values, but we can use the packed
487 * vector float and a type-converting MOV.
488 */
489 dst_reg shift(this, glsl_type::uvec4_type);
490 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
491
492 dst_reg shifted(this, glsl_type::uvec4_type);
493 src0.swizzle = BRW_SWIZZLE_XXXX;
494 emit(SHR(shifted, src0, src_reg(shift)));
495
496 shifted.type = BRW_REGISTER_TYPE_UB;
497 dst_reg f(this, glsl_type::vec4_type);
498 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
499
500 emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
501 }
502
503 void
504 vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
505 {
506 /* Instead of splitting the 32-bit integer, shifting, and ORing it back
507 * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
508 * is not suitable to generate the shift values, but we can use the packed
509 * vector float and a type-converting MOV.
510 */
511 dst_reg shift(this, glsl_type::uvec4_type);
512 emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78)));
513
514 dst_reg shifted(this, glsl_type::uvec4_type);
515 src0.swizzle = BRW_SWIZZLE_XXXX;
516 emit(SHR(shifted, src0, src_reg(shift)));
517
518 shifted.type = BRW_REGISTER_TYPE_B;
519 dst_reg f(this, glsl_type::vec4_type);
520 emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
521
522 dst_reg scaled(this, glsl_type::vec4_type);
523 emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
524
525 dst_reg max(this, glsl_type::vec4_type);
526 emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
527 emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
528 }
529
530 void
531 vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
532 {
533 dst_reg saturated(this, glsl_type::vec4_type);
534 vec4_instruction *inst = emit(MOV(saturated, src0));
535 inst->saturate = true;
536
537 dst_reg scaled(this, glsl_type::vec4_type);
538 emit(MUL(scaled, src_reg(saturated), brw_imm_f(255.0f)));
539
540 dst_reg rounded(this, glsl_type::vec4_type);
541 emit(RNDE(rounded, src_reg(scaled)));
542
543 dst_reg u(this, glsl_type::uvec4_type);
544 emit(MOV(u, src_reg(rounded)));
545
546 src_reg bytes(u);
547 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
548 }
549
550 void
551 vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
552 {
553 dst_reg max(this, glsl_type::vec4_type);
554 emit_minmax(BRW_CONDITIONAL_GE, max, src0, brw_imm_f(-1.0f));
555
556 dst_reg min(this, glsl_type::vec4_type);
557 emit_minmax(BRW_CONDITIONAL_L, min, src_reg(max), brw_imm_f(1.0f));
558
559 dst_reg scaled(this, glsl_type::vec4_type);
560 emit(MUL(scaled, src_reg(min), brw_imm_f(127.0f)));
561
562 dst_reg rounded(this, glsl_type::vec4_type);
563 emit(RNDE(rounded, src_reg(scaled)));
564
565 dst_reg i(this, glsl_type::ivec4_type);
566 emit(MOV(i, src_reg(rounded)));
567
568 src_reg bytes(i);
569 emit(VEC4_OPCODE_PACK_BYTES, dst, bytes);
570 }
571
572 /*
573 * Returns the minimum number of vec4 (as_vec4 == true) or dvec4 (as_vec4 ==
574 * false) elements needed to pack a type.
575 */
576 static int
577 type_size_xvec4(const struct glsl_type *type, bool as_vec4)
578 {
579 unsigned int i;
580 int size;
581
582 switch (type->base_type) {
583 case GLSL_TYPE_UINT:
584 case GLSL_TYPE_INT:
585 case GLSL_TYPE_FLOAT:
586 case GLSL_TYPE_BOOL:
587 case GLSL_TYPE_DOUBLE:
588 if (type->is_matrix()) {
589 const glsl_type *col_type = type->column_type();
590 unsigned col_slots =
591 (as_vec4 && col_type->is_dual_slot()) ? 2 : 1;
592 return type->matrix_columns * col_slots;
593 } else {
594 /* Regardless of the size of the vector, it gets a vec4. This is bad
595 * packing for things like floats, but otherwise arrays become a
596 * mess. Hopefully a later pass over the code can pack scalars
597 * down if appropriate.
598 */
599 return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
600 }
601 case GLSL_TYPE_ARRAY:
602 assert(type->length > 0);
603 return type_size_xvec4(type->fields.array, as_vec4) * type->length;
604 case GLSL_TYPE_STRUCT:
605 size = 0;
606 for (i = 0; i < type->length; i++) {
607 size += type_size_xvec4(type->fields.structure[i].type, as_vec4);
608 }
609 return size;
610 case GLSL_TYPE_SUBROUTINE:
611 return 1;
612
613 case GLSL_TYPE_SAMPLER:
614 /* Samplers take up no register space, since they're baked in at
615 * link time.
616 */
617 return 0;
618 case GLSL_TYPE_ATOMIC_UINT:
619 return 0;
620 case GLSL_TYPE_IMAGE:
621 return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
622 case GLSL_TYPE_VOID:
623 case GLSL_TYPE_ERROR:
624 case GLSL_TYPE_INTERFACE:
625 case GLSL_TYPE_FUNCTION:
626 unreachable("not reached");
627 }
628
629 return 0;
630 }
631
632 /**
633 * Returns the minimum number of vec4 elements needed to pack a type.
634 *
635 * For simple types, it will return 1 (a single vec4); for matrices, the
636 * number of columns; for array and struct, the sum of the vec4_size of
637 * each of its elements; and for sampler and atomic, zero.
638 *
639 * This method is useful to calculate how much register space is needed to
640 * store a particular type.
641 */
642 extern "C" int
643 type_size_vec4(const struct glsl_type *type)
644 {
645 return type_size_xvec4(type, true);
646 }
647
648 /**
649 * Returns the minimum number of dvec4 elements needed to pack a type.
650 *
651 * For simple types, it will return 1 (a single dvec4); for matrices, the
652 * number of columns; for array and struct, the sum of the dvec4_size of
653 * each of its elements; and for sampler and atomic, zero.
654 *
655 * This method is useful to calculate how much register space is needed to
656 * store a particular type.
657 *
658 * Measuring double-precision vertex inputs as dvec4 is required because
659 * ARB_vertex_attrib_64bit states that these use the same number of locations
660 * as the single-precision version. That is, two consecutive dvec4s would be
661 * located in locations "x" and "x+1", not "x+2".
662 *
663 * In order to map vec4/dvec4 vertex inputs to the proper ATTRs,
664 * remap_vs_attrs() takes into account both the location and whether the
665 * type fits in one or two vec4 slots.
666 */
667 extern "C" int
668 type_size_dvec4(const struct glsl_type *type)
669 {
670 return type_size_xvec4(type, false);
671 }
672
673 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
674 {
675 init();
676
677 this->file = VGRF;
678 this->nr = v->alloc.allocate(type_size_vec4(type));
679
680 if (type->is_array() || type->is_record()) {
681 this->swizzle = BRW_SWIZZLE_NOOP;
682 } else {
683 this->swizzle = brw_swizzle_for_size(type->vector_elements);
684 }
685
686 this->type = brw_type_for_base_type(type);
687 }
688
689 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
690 {
691 assert(size > 0);
692
693 init();
694
695 this->file = VGRF;
696 this->nr = v->alloc.allocate(type_size_vec4(type) * size);
697
698 this->swizzle = BRW_SWIZZLE_NOOP;
699
700 this->type = brw_type_for_base_type(type);
701 }
702
703 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
704 {
705 init();
706
707 this->file = VGRF;
708 this->nr = v->alloc.allocate(type_size_vec4(type));
709
710 if (type->is_array() || type->is_record()) {
711 this->writemask = WRITEMASK_XYZW;
712 } else {
713 this->writemask = (1 << type->vector_elements) - 1;
714 }
715
716 this->type = brw_type_for_base_type(type);
717 }
718
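/**
 * Emit a SEL instruction with the given conditional mod, computing
 * min(src0, src1) for BRW_CONDITIONAL_L and max(src0, src1) for
 * BRW_CONDITIONAL_GE.
 */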
719 vec4_instruction *
720 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
721 src_reg src0, src_reg src1)
722 {
723 vec4_instruction *inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
724 inst->conditional_mod = conditionalmod;
725 return inst;
726 }
727
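/**
 * Emit a linear interpolation of \p x and \p y by \p a, i.e. x*(1-a) + y*a,
 * using the LRP instruction on gen6+ and an expanded MUL/ADD sequence on
 * earlier generations.
 */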
728 vec4_instruction *
729 vec4_visitor::emit_lrp(const dst_reg &dst,
730 const src_reg &x, const src_reg &y, const src_reg &a)
731 {
732 if (devinfo->gen >= 6) {
733 /* Note that the instruction's argument order is reversed from GLSL
734 * and the IR.
735 */
736 return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
737 fix_3src_operand(x)));
738 } else {
739 /* Earlier generations don't support three source operations, so we
740 * need to emit x*(1-a) + y*a.
741 */
742 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
743 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
744 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
745 y_times_a.writemask = dst.writemask;
746 one_minus_a.writemask = dst.writemask;
747 x_times_one_minus_a.writemask = dst.writemask;
748
749 emit(MUL(y_times_a, y, a));
750 emit(ADD(one_minus_a, negate(a), brw_imm_f(1.0f)));
751 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
752 return emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
753 }
754 }
755
756 /**
757 * Emits the instructions needed to perform a pull constant load. before_block
758 * and before_inst can be NULL, in which case the instruction will be appended
759 * to the end of the instruction list.
760 */
761 void
762 vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
763 src_reg surf_index,
764 src_reg offset_reg,
765 bblock_t *before_block,
766 vec4_instruction *before_inst)
767 {
768 assert((before_inst == NULL && before_block == NULL) ||
769 (before_inst && before_block));
770
771 vec4_instruction *pull;
772
773 if (devinfo->gen >= 9) {
774 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
775 src_reg header(this, glsl_type::uvec4_type, 2);
776
777 pull = new(mem_ctx)
778 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
779 dst_reg(header));
780
781 if (before_inst)
782 emit_before(before_block, before_inst, pull);
783 else
784 emit(pull);
785
786 dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
787 offset_reg.type);
788 pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
789
790 if (before_inst)
791 emit_before(before_block, before_inst, pull);
792 else
793 emit(pull);
794
795 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
796 dst,
797 surf_index,
798 header);
799 pull->mlen = 2;
800 pull->header_size = 1;
801 } else if (devinfo->gen >= 7) {
802 dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
803
804 grf_offset.type = offset_reg.type;
805
806 pull = MOV(grf_offset, offset_reg);
807
808 if (before_inst)
809 emit_before(before_block, before_inst, pull);
810 else
811 emit(pull);
812
813 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
814 dst,
815 surf_index,
816 src_reg(grf_offset));
817 pull->mlen = 1;
818 } else {
819 pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
820 dst,
821 surf_index,
822 offset_reg);
823 pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
824 pull->mlen = 1;
825 }
826
827 if (before_inst)
828 emit_before(before_block, before_inst, pull);
829 else
830 emit(pull);
831 }
832
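/**
 * Copy the value of an arbitrary live channel of \p src into every channel
 * of the result, yielding a value that is uniform across the execution mask.
 */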
833 src_reg
834 vec4_visitor::emit_uniformize(const src_reg &src)
835 {
836 const src_reg chan_index(this, glsl_type::uint_type);
837 const dst_reg dst = retype(dst_reg(this, glsl_type::uint_type),
838 src.type);
839
840 emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(chan_index))
841 ->force_writemask_all = true;
842 emit(SHADER_OPCODE_BROADCAST, dst, src, chan_index)
843 ->force_writemask_all = true;
844
845 return src_reg(dst);
846 }
847
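/**
 * Fetch the MCS (multisample control surface) data for the given texel
 * coordinate, for use as a parameter of a subsequent TXF_CMS or TXF_CMS_W
 * message.
 */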
848 src_reg
849 vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
850 src_reg coordinate, src_reg surface)
851 {
852 vec4_instruction *inst =
853 new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
854 dst_reg(this, glsl_type::uvec4_type));
855 inst->base_mrf = 2;
856 inst->src[1] = surface;
857 inst->src[2] = surface;
858
859 int param_base;
860
861 if (devinfo->gen >= 9) {
862 /* Gen9+ needs a message header in order to use SIMD4x2 mode */
863 vec4_instruction *header_inst = new(mem_ctx)
864 vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
865 dst_reg(MRF, inst->base_mrf));
866
867 emit(header_inst);
868
869 inst->mlen = 2;
870 inst->header_size = 1;
871 param_base = inst->base_mrf + 1;
872 } else {
873 inst->mlen = 1;
874 param_base = inst->base_mrf;
875 }
876
877 /* parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
878 int coord_mask = (1 << coordinate_type->vector_elements) - 1;
879 int zero_mask = 0xf & ~coord_mask;
880
881 emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
882 coordinate));
883
884 emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
885 brw_imm_d(0)));
886
887 emit(inst);
888 return src_reg(inst->dst);
889 }
890
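/**
 * Returns true if \p sampler may refer to a sampler index of 16 or higher,
 * which requires a message header and is only possible on Haswell and later
 * hardware.
 */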
891 bool
892 vec4_visitor::is_high_sampler(src_reg sampler)
893 {
894 if (devinfo->gen < 8 && !devinfo->is_haswell)
895 return false;
896
897 return sampler.file != IMM || sampler.ud >= 16;
898 }
899
900 void
901 vec4_visitor::emit_texture(ir_texture_opcode op,
902 dst_reg dest,
903 const glsl_type *dest_type,
904 src_reg coordinate,
905 int coord_components,
906 src_reg shadow_comparator,
907 src_reg lod, src_reg lod2,
908 src_reg sample_index,
909 uint32_t constant_offset,
910 src_reg offset_value,
911 src_reg mcs,
912 uint32_t surface,
913 src_reg surface_reg,
914 src_reg sampler_reg)
915 {
916 /* The sampler can only meaningfully compute LOD for fragment shader
917 * messages. For all other stages, we change the opcode to TXL and hardcode
918 * the LOD to 0.
919 *
920 * textureQueryLevels() is implemented in terms of TXS so we need to pass a
921 * valid LOD argument.
922 */
923 if (op == ir_tex || op == ir_query_levels) {
924 assert(lod.file == BAD_FILE);
925 lod = brw_imm_f(0.0f);
926 }
927
928 enum opcode opcode;
929 switch (op) {
930 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
931 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
932 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
933 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
934 case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
935 SHADER_OPCODE_TXF_CMS); break;
936 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
937 case ir_tg4: opcode = offset_value.file != BAD_FILE
938 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
939 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
940 case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
941 case ir_txb:
942 unreachable("TXB is not valid for vertex shaders.");
943 case ir_lod:
944 unreachable("LOD is not valid for vertex shaders.");
945 case ir_samples_identical: {
946 /* There are some challenges implementing this for vec4, and it seems
947 * unlikely to be used anyway. For now, just always return false.
948 */
949 emit(MOV(dest, brw_imm_ud(0u)));
950 return;
951 }
952 default:
953 unreachable("Unrecognized tex op");
954 }
955
956 vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
957
958 inst->offset = constant_offset;
959
960 /* The message header is necessary for:
961 * - Gen4 (always)
962 * - Gen9+ for selecting SIMD4x2
963 * - Texel offsets
964 * - Gather channel selection
965 * - Sampler indices too large to fit in a 4-bit value.
966 * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
967 */
968 inst->header_size =
969 (devinfo->gen < 5 || devinfo->gen >= 9 ||
970 inst->offset != 0 || op == ir_tg4 ||
971 op == ir_texture_samples ||
972 is_high_sampler(sampler_reg)) ? 1 : 0;
973 inst->base_mrf = 2;
974 inst->mlen = inst->header_size;
975 inst->dst.writemask = WRITEMASK_XYZW;
976 inst->shadow_compare = shadow_comparator.file != BAD_FILE;
977
978 inst->src[1] = surface_reg;
979 inst->src[2] = sampler_reg;
980
981 /* MRF for the first parameter */
982 int param_base = inst->base_mrf + inst->header_size;
983
984 if (op == ir_txs || op == ir_query_levels) {
985 int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
986 emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
987 inst->mlen++;
988 } else if (op == ir_texture_samples) {
989 inst->dst.writemask = WRITEMASK_X;
990 } else {
991 /* Load the coordinate */
992 /* FINISHME: gl_clamp_mask and saturate */
993 int coord_mask = (1 << coord_components) - 1;
994 int zero_mask = 0xf & ~coord_mask;
995
996 emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
997 coordinate));
998 inst->mlen++;
999
1000 if (zero_mask != 0) {
1001 emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
1002 brw_imm_d(0)));
1003 }
1004 /* Load the shadow comparator */
1005 if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
1006 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
1007 WRITEMASK_X),
1008 shadow_comparator));
1009 inst->mlen++;
1010 }
1011
1012 /* Load the LOD info */
1013 if (op == ir_tex || op == ir_txl) {
1014 int mrf, writemask;
1015 if (devinfo->gen >= 5) {
1016 mrf = param_base + 1;
1017 if (shadow_comparator.file != BAD_FILE) {
1018 writemask = WRITEMASK_Y;
1019 /* mlen already incremented */
1020 } else {
1021 writemask = WRITEMASK_X;
1022 inst->mlen++;
1023 }
1024 } else /* devinfo->gen == 4 */ {
1025 mrf = param_base;
1026 writemask = WRITEMASK_W;
1027 }
1028 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
1029 } else if (op == ir_txf) {
1030 emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
1031 } else if (op == ir_txf_ms) {
1032 emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
1033 sample_index));
1034 if (opcode == SHADER_OPCODE_TXF_CMS_W) {
1035 /* MCS data is stored in the first two channels of `mcs`, but we
1036 * need to get it into the .y and .z channels of the second vec4
1037 * of params.
1038 */
1039 mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
1040 emit(MOV(dst_reg(MRF, param_base + 1,
1041 glsl_type::uint_type, WRITEMASK_YZ),
1042 mcs));
1043 } else if (devinfo->gen >= 7) {
1044 /* MCS data is in the first channel of `mcs`, but we need to get it into
1045 * the .y channel of the second vec4 of params, so replicate .x across
1046 * the whole vec4 and then mask off everything except .y
1047 */
1048 mcs.swizzle = BRW_SWIZZLE_XXXX;
1049 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
1050 mcs));
1051 }
1052 inst->mlen++;
1053 } else if (op == ir_txd) {
1054 const brw_reg_type type = lod.type;
1055
1056 if (devinfo->gen >= 5) {
1057 lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1058 lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
1059 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
1060 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
1061 inst->mlen++;
1062
1063 if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
1064 lod.swizzle = BRW_SWIZZLE_ZZZZ;
1065 lod2.swizzle = BRW_SWIZZLE_ZZZZ;
1066 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
1067 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
1068 inst->mlen++;
1069
1070 if (shadow_comparator.file != BAD_FILE) {
1071 emit(MOV(dst_reg(MRF, param_base + 2,
1072 shadow_comparator.type, WRITEMASK_Z),
1073 shadow_comparator));
1074 }
1075 }
1076 } else /* devinfo->gen == 4 */ {
1077 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
1078 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
1079 inst->mlen += 2;
1080 }
1081 } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
1082 if (shadow_comparator.file != BAD_FILE) {
1083 emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
1084 shadow_comparator));
1085 }
1086
1087 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
1088 offset_value));
1089 inst->mlen++;
1090 }
1091 }
1092
1093 emit(inst);
1094
1095 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
1096 * spec requires layers.
1097 */
1098 if (op == ir_txs && devinfo->gen < 7) {
1099 /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
1100 emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
1101 src_reg(inst->dst), brw_imm_d(1));
1102 }
1103
1104 if (devinfo->gen == 6 && op == ir_tg4) {
1105 emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
1106 }
1107
1108 if (op == ir_query_levels) {
1109 /* # levels is in .w */
1110 src_reg swizzled(dest);
1111 swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
1112 SWIZZLE_W, SWIZZLE_W);
1113 emit(MOV(dest, swizzled));
1114 }
1115 }
1116
1117 /**
1118 * Apply workarounds for Gen6 gather with UINT/SINT
1119 */
1120 void
1121 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
1122 {
1123 if (!wa)
1124 return;
1125
1126 int width = (wa & WA_8BIT) ? 8 : 16;
1127 dst_reg dst_f = dst;
1128 dst_f.type = BRW_REGISTER_TYPE_F;
1129
1130 /* Convert from UNORM to UINT */
1131 emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
1132 emit(MOV(dst, src_reg(dst_f)));
1133
1134 if (wa & WA_SIGN) {
1135 /* Reinterpret the UINT value as a signed INT value by
1136 * shifting the sign bit into place, then shifting back
1137 * preserving sign.
1138 */
1139 emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
1140 emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
1141 }
1142 }
1143
1144 void
1145 vec4_visitor::gs_emit_vertex(int /* stream_id */)
1146 {
1147 unreachable("not reached");
1148 }
1149
1150 void
1151 vec4_visitor::gs_end_primitive()
1152 {
1153 unreachable("not reached");
1154 }
1155
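/**
 * Compute the normalized device coordinates (x/w, y/w, z/w, 1/w) from the
 * written gl_Position and store them in the BRW_VARYING_SLOT_NDC output for
 * the pre-gen6 pipeline.
 */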
1156 void
1157 vec4_visitor::emit_ndc_computation()
1158 {
1159 if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
1160 return;
1161
1162 /* Get the position */
1163 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
1164
1165 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
1166 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
1167 output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
1168 output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
1169
1170 current_annotation = "NDC";
1171 dst_reg ndc_w = ndc;
1172 ndc_w.writemask = WRITEMASK_W;
1173 src_reg pos_w = pos;
1174 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
1175 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
1176
1177 dst_reg ndc_xyz = ndc;
1178 ndc_xyz.writemask = WRITEMASK_XYZ;
1179
1180 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
1181 }
1182
1183 void
1184 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
1185 {
1186 if (devinfo->gen < 6 &&
1187 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
1188 output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
1189 devinfo->has_negative_rhw_bug)) {
1190 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
1191 dst_reg header1_w = header1;
1192 header1_w.writemask = WRITEMASK_W;
1193
1194 emit(MOV(header1, brw_imm_ud(0u)));
1195
1196 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1197 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1198
1199 current_annotation = "Point size";
1200 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
1201 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
1202 }
1203
1204 if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
1205 current_annotation = "Clipping flags";
1206 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
1207 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
1208
1209 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1210 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
1211 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
1212
1213 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1214 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
1215 emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
1216 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
1217 }
1218
1219 /* i965 clipping workaround:
1220 * 1) Test for -ve rhw
1221 * 2) If set,
1222 * set ndc = (0,0,0,0)
1223 * set ucp[6] = 1
1224 *
1225 * Later, clipping will detect ucp[6] and ensure the primitive is
1226 * clipped against all fixed planes.
1227 */
1228 if (devinfo->has_negative_rhw_bug &&
1229 output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
1230 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
1231 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
1232 emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
1233 vec4_instruction *inst;
1234 inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
1235 inst->predicate = BRW_PREDICATE_NORMAL;
1236 output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
1237 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
1238 inst->predicate = BRW_PREDICATE_NORMAL;
1239 }
1240
1241 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
1242 } else if (devinfo->gen < 6) {
1243 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
1244 } else {
1245 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
1246 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
1247 dst_reg reg_w = reg;
1248 reg_w.writemask = WRITEMASK_W;
1249 src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
1250 reg_as_src.type = reg_w.type;
1251 reg_as_src.swizzle = brw_swizzle_for_size(1);
1252 emit(MOV(reg_w, reg_as_src));
1253 }
1254 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
1255 dst_reg reg_y = reg;
1256 reg_y.writemask = WRITEMASK_Y;
1257 reg_y.type = BRW_REGISTER_TYPE_D;
1258 output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
1259 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
1260 }
1261 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
1262 dst_reg reg_z = reg;
1263 reg_z.writemask = WRITEMASK_Z;
1264 reg_z.type = BRW_REGISTER_TYPE_D;
1265 output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
1266 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
1267 }
1268 }
1269 }
1270
1271 vec4_instruction *
1272 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
1273 {
1274 assert(varying < VARYING_SLOT_MAX);
1275
1276 unsigned num_comps = output_num_components[varying][component];
1277 if (num_comps == 0)
1278 return NULL;
1279
1280 assert(output_reg[varying][component].type == reg.type);
1281 current_annotation = output_reg_annotation[varying];
1282 if (output_reg[varying][component].file != BAD_FILE) {
1283 src_reg src = src_reg(output_reg[varying][component]);
1284 src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
1285 reg.writemask =
1286 brw_writemask_for_component_packing(num_comps, component);
1287 return emit(MOV(reg, src));
1288 }
1289 return NULL;
1290 }
1291
1292 void
1293 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
1294 {
1295 reg.type = BRW_REGISTER_TYPE_F;
1296 output_reg[varying][0].type = reg.type;
1297
1298 switch (varying) {
1299 case VARYING_SLOT_PSIZ:
1300 {
1301 /* PSIZ is always in slot 0, and is coupled with other flags. */
1302 current_annotation = "indices, point width, clip flags";
1303 emit_psiz_and_flags(reg);
1304 break;
1305 }
1306 case BRW_VARYING_SLOT_NDC:
1307 current_annotation = "NDC";
1308 if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
1309 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
1310 break;
1311 case VARYING_SLOT_POS:
1312 current_annotation = "gl_Position";
1313 if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
1314 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
1315 break;
1316 case VARYING_SLOT_EDGE:
1317 /* This is present when doing unfilled polygons. We're supposed to copy
1318 * the edge flag from the user-provided vertex array
1319 * (glEdgeFlagPointer), or else from the current value of that
1320 * attribute (which starts as 1.0f). This is then used in clipping to
1321 * determine which edges should be drawn as wireframe.
1322 */
1323 current_annotation = "edge flag";
1324 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
1325 glsl_type::float_type, WRITEMASK_XYZW))));
1326 break;
1327 case BRW_VARYING_SLOT_PAD:
1328 /* No need to write to this slot */
1329 break;
1330 default:
1331 for (int i = 0; i < 4; i++) {
1332 emit_generic_urb_slot(reg, varying, i);
1333 }
1334 break;
1335 }
1336 }
1337
1338 static int
1339 align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
1340 {
1341 if (devinfo->gen >= 6) {
1342 /* URB data written (does not include the message header reg) must
1343 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1344 * section 5.4.3.2.2: URB_INTERLEAVED.
1345 *
1346 * URB entries are allocated on a multiple of 1024 bits, so an
1347 * extra 128 bits written here to make the end align to 256 is
1348 * no problem.
1349 */
1350 if ((mlen % 2) != 1)
1351 mlen++;
1352 }
1353
1354 return mlen;
1355 }
1356
1357
1358 /**
1359 * Generates the VUE payload plus the necessary URB write instructions to
1360 * output it.
1361 *
1362 * The VUE layout is documented in Volume 2a.
1363 */
1364 void
1365 vec4_visitor::emit_vertex()
1366 {
1367 /* MRF 0 is reserved for the debugger, so start with message header
1368 * in MRF 1.
1369 */
1370 int base_mrf = 1;
1371 int mrf = base_mrf;
1372 /* In the process of generating our URB write message contents, we
1373 * may need to unspill a register or load from an array. Those
1374 * reads would use MRFs 14-15.
1375 */
1376 int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
1377
1378 * The following assertion verifies that max_usable_mrf results in an
1379 * even number of URB write data registers, which meets gen6's
1380 * requirements for length alignment.
1381 */
1382 assert ((max_usable_mrf - base_mrf) % 2 == 0);
1383
1384 /* First mrf is the g0-based message header containing URB handles and
1385 * such.
1386 */
1387 emit_urb_write_header(mrf++);
1388
1389 if (devinfo->gen < 6) {
1390 emit_ndc_computation();
1391 }
1392
1393 /* We may need to split this up into several URB writes, so do them in a
1394 * loop.
1395 */
1396 int slot = 0;
1397 bool complete = false;
1398 do {
1399 /* URB offset is in URB row increments, and each of our MRFs is half of
1400 * one of those, since we're doing interleaved writes.
1401 */
1402 int offset = slot / 2;
1403
1404 mrf = base_mrf + 1;
1405 for (; slot < prog_data->vue_map.num_slots; ++slot) {
1406 emit_urb_slot(dst_reg(MRF, mrf++),
1407 prog_data->vue_map.slot_to_varying[slot]);
1408
1409 /* If this was max_usable_mrf, we can't fit anything more into this
1410 * URB WRITE. Same thing if we reached the maximum length available.
1411 */
1412 if (mrf > max_usable_mrf ||
1413 align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
1414 slot++;
1415 break;
1416 }
1417 }
1418
1419 complete = slot >= prog_data->vue_map.num_slots;
1420 current_annotation = "URB write";
1421 vec4_instruction *inst = emit_urb_write_opcode(complete);
1422 inst->base_mrf = base_mrf;
1423 inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
1424 inst->offset += offset;
1425 } while(!complete);
1426 }
1427
1428
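/**
 * Compute the offset to use in the scratch read/write message header for
 * vec4 offset \p reg_offset, scaling it to the units the message expects and
 * adding the dynamic \p reladdr component when present.
 */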
1429 src_reg
1430 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
1431 src_reg *reladdr, int reg_offset)
1432 {
1433 /* Because we store the values to scratch interleaved like our
1434 * vertex data, we need to scale the vec4 index by 2.
1435 */
1436 int message_header_scale = 2;
1437
1438 /* Pre-gen6, the message header uses byte offsets instead of vec4
1439 * (16-byte) offset units.
1440 */
1441 if (devinfo->gen < 6)
1442 message_header_scale *= 16;
1443
1444 if (reladdr) {
1445 src_reg index = src_reg(this, glsl_type::int_type);
1446
1447 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
1448 brw_imm_d(reg_offset)));
1449 emit_before(block, inst, MUL(dst_reg(index), index,
1450 brw_imm_d(message_header_scale)));
1451
1452 return index;
1453 } else {
1454 return brw_imm_d(reg_offset * message_header_scale);
1455 }
1456 }
1457
1458 /**
1459 * Emits an instruction before @inst to load the value named by @orig_src
1460 * from scratch space at @base_offset to @temp.
1461 *
1462 * @base_offset is measured in 32-byte units (the size of a register).
1463 */
1464 void
1465 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
1466 dst_reg temp, src_reg orig_src,
1467 int base_offset)
1468 {
1469 assert(orig_src.offset % REG_SIZE == 0);
1470 int reg_offset = base_offset + orig_src.offset / REG_SIZE;
1471 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
1472 reg_offset);
1473
1474 emit_before(block, inst, SCRATCH_READ(temp, index));
1475 }
1476
1477 /**
1478 * Emits an instruction after @inst to store the value to be written
1479 * to @orig_dst to scratch space at @base_offset, from @temp.
1480 *
1481 * @base_offset is measured in 32-byte units (the size of a register).
1482 */
1483 void
1484 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
1485 int base_offset)
1486 {
1487 assert(inst->dst.offset % REG_SIZE == 0);
1488 int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
1489 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
1490 reg_offset);
1491
1492 /* Create a temporary register to store *inst's result in.
1493 *
1494 * We have to be careful in MOVing from our temporary result register in
1495 * the scratch write. If we swizzle from channels of the temporary that
1496 * weren't initialized, it will confuse live interval analysis, which will
1497 * make spilling fail to make progress.
1498 */
1499 const src_reg temp = swizzle(retype(src_reg(this, glsl_type::vec4_type),
1500 inst->dst.type),
1501 brw_swizzle_for_mask(inst->dst.writemask));
1502 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
1503 inst->dst.writemask));
1504 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
1505 if (inst->opcode != BRW_OPCODE_SEL)
1506 write->predicate = inst->predicate;
1507 write->ir = inst->ir;
1508 write->annotation = inst->annotation;
1509 inst->insert_after(block, write);
1510
1511 inst->dst.file = temp.file;
1512 inst->dst.nr = temp.nr;
1513 inst->dst.offset %= REG_SIZE;
1514 inst->dst.reladdr = NULL;
1515 }
1516
1517 /**
1518 * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
1519 * adds the scratch read(s) before \p inst. The function also checks for
1520 * recursive reladdr scratch accesses, issuing the corresponding scratch
1521 * loads and rewriting reladdr references accordingly.
1522 *
1523 * \return \p src if it did not require a scratch load, otherwise, the
1524 * register holding the result of the scratch load that the caller should
1525 * use to rewrite src.
1526 */
1527 src_reg
1528 vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
1529 vec4_instruction *inst, src_reg src)
1530 {
1531 /* Resolve recursive reladdr scratch access by calling ourselves
1532 * with src.reladdr
1533 */
1534 if (src.reladdr)
1535 *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1536 *src.reladdr);
1537
1538 /* Now handle scratch access on src */
1539 if (src.file == VGRF && scratch_loc[src.nr] != -1) {
1540 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
1541 emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
1542 src.nr = temp.nr;
1543 src.offset %= REG_SIZE;
1544 src.reladdr = NULL;
1545 }
1546
1547 return src;
1548 }
1549
1550 /**
1551 * We can't generally support array access in GRF space, because a
1552 * single instruction's destination can only span 2 contiguous
1553 * registers. So, we send all GRF arrays that get variable index
1554 * access to scratch space.
1555 */
1556 void
1557 vec4_visitor::move_grf_array_access_to_scratch()
1558 {
1559 int scratch_loc[this->alloc.count];
1560 memset(scratch_loc, -1, sizeof(scratch_loc));
1561
1562 /* First, calculate the set of virtual GRFs that need to be punted
1563 * to scratch due to having any array access on them, and where in
1564 * scratch.
1565 */
1566 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1567 if (inst->dst.file == VGRF && inst->dst.reladdr) {
1568 if (scratch_loc[inst->dst.nr] == -1) {
1569 scratch_loc[inst->dst.nr] = last_scratch;
1570 last_scratch += this->alloc.sizes[inst->dst.nr];
1571 }
1572
1573 for (src_reg *iter = inst->dst.reladdr;
1574 iter->reladdr;
1575 iter = iter->reladdr) {
1576 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1577 scratch_loc[iter->nr] = last_scratch;
1578 last_scratch += this->alloc.sizes[iter->nr];
1579 }
1580 }
1581 }
1582
1583 for (int i = 0 ; i < 3; i++) {
1584 for (src_reg *iter = &inst->src[i];
1585 iter->reladdr;
1586 iter = iter->reladdr) {
1587 if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
1588 scratch_loc[iter->nr] = last_scratch;
1589 last_scratch += this->alloc.sizes[iter->nr];
1590 }
1591 }
1592 }
1593 }
1594
1595 /* Now, for anything that will be accessed through scratch, rewrite
1596 * it to load/store. Note that this is a _safe list walk, because
1597 * we may generate a new scratch_write instruction after the one
1598 * we're processing.
1599 */
1600 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1601 /* Set up the annotation tracking for new generated instructions. */
1602 base_ir = inst->ir;
1603 current_annotation = inst->annotation;
1604
1605 /* First handle scratch access on the dst. Notice we have to handle
1606 * the case where the dst's reladdr also points to scratch space.
1607 */
1608 if (inst->dst.reladdr)
1609 *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
1610 *inst->dst.reladdr);
1611
1612 /* Now that we have handled any (possibly recursive) reladdr scratch
1613 * accesses for dst we can safely do the scratch write for dst itself
1614 */
1615 if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
1616 emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
1617
1618 /* Now handle scratch access on any src. In this case, since inst->src[i]
1619 * already is a src_reg, we can just call emit_resolve_reladdr with
1620 * inst->src[i] and it will take care of handling scratch loads for
1621 * both src and src.reladdr (recursively).
1622 */
1623 for (int i = 0 ; i < 3; i++) {
1624 inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
1625 inst->src[i]);
1626 }
1627 }
1628 }
1629
1630 /**
1631 * Emits an instruction before @inst to load the value named by @orig_src
1632 * from the pull constant buffer (surface) at @base_offset to @temp.
1633 */
1634 void
1635 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
1636 dst_reg temp, src_reg orig_src,
1637 int base_offset, src_reg indirect)
1638 {
1639 assert(orig_src.offset % 16 == 0);
1640 int reg_offset = base_offset + orig_src.offset / 16;
1641 const unsigned index = prog_data->base.binding_table.pull_constants_start;
1642
1643 src_reg offset;
1644 if (indirect.file != BAD_FILE) {
1645 offset = src_reg(this, glsl_type::uint_type);
1646
1647 emit_before(block, inst, ADD(dst_reg(offset), indirect,
1648 brw_imm_ud(reg_offset * 16)));
1649 } else if (devinfo->gen >= 8) {
1650 /* Store the offset in a GRF so we can send-from-GRF. */
1651 offset = src_reg(this, glsl_type::uint_type);
1652 emit_before(block, inst, MOV(dst_reg(offset), brw_imm_ud(reg_offset * 16)));
1653 } else {
1654 offset = brw_imm_d(reg_offset * 16);
1655 }
1656
1657 emit_pull_constant_load_reg(temp,
1658 brw_imm_ud(index),
1659 offset,
1660 block, inst);
1661
1662 brw_mark_surface_used(&prog_data->base, index);
1663 }
1664
1665 /**
1666 * Implements array access of uniforms by inserting a
1667 * PULL_CONSTANT_LOAD instruction.
1668 *
1669 * Unlike temporary GRF array access (which we don't support due to
1670 * the difficulty of doing relative addressing on instruction
1671 * destinations), we could potentially do array access of uniforms
1672 * that were loaded in GRF space as push constants. In real-world
1673 * usage we've seen, though, the arrays being used are always larger
1674 * than we could load as push constants, so just always move all
1675 * uniform array access out to a pull constant buffer.
1676 */
1677 void
1678 vec4_visitor::move_uniform_array_access_to_pull_constants()
1679 {
1680 /* The Vulkan driver doesn't support pull constants other than UBOs, so
1681 * everything has to be pushed regardless.
1682 */
1683 if (stage_prog_data->pull_param == NULL) {
1684 split_uniform_registers();
1685 return;
1686 }
1687
1688 int pull_constant_loc[this->uniforms];
1689 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
1690
1691 /* First, walk through the instructions and determine which things need to
1692 * be pulled. We mark something as needing to be pulled by setting
1693 * pull_constant_loc to 0.
1694 */
1695 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
1696 /* We only care about MOV_INDIRECT of a uniform */
1697 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1698 inst->src[0].file != UNIFORM)
1699 continue;
1700
1701 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1702
1703 for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
1704 pull_constant_loc[uniform_nr + j] = 0;
1705 }
1706
1707 /* Next, we walk the list of uniforms and assign real pull constant
1708 * locations and set their corresponding entries in pull_param.
1709 */
1710 for (int j = 0; j < this->uniforms; j++) {
1711 if (pull_constant_loc[j] < 0)
1712 continue;
1713
1714 pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
1715
1716 for (int i = 0; i < 4; i++) {
1717 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
1718 = stage_prog_data->param[j * 4 + i];
1719 }
1720 }
1721
1722 /* Finally, we can walk through the instructions and lower MOV_INDIRECT
1723 * instructions to actual uniform pulls.
1724 */
1725 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
1726 /* We only care about MOV_INDIRECT of a uniform */
1727 if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
1728 inst->src[0].file != UNIFORM)
1729 continue;
1730
1731 int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
1732
1733 assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
1734
1735 emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
1736 pull_constant_loc[uniform_nr], inst->src[1]);
1737 inst->remove(block);
1738 }
1739
1740 /* Now there are no accesses of the UNIFORM file with a reladdr, so
1741 * no need to track them as larger-than-vec4 objects. This will be
1742 * relied on in cutting out unused uniform vectors from push
1743 * constants.
1744 */
1745 split_uniform_registers();
1746 }
1747
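/* If \p reg is an unsigned (UD) source with the negate modifier set,
 * materialize the negated value into a temporary with an explicit MOV and
 * rewrite \p reg to point at it.
 */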
1748 void
1749 vec4_visitor::resolve_ud_negate(src_reg *reg)
1750 {
1751 if (reg->type != BRW_REGISTER_TYPE_UD ||
1752 !reg->negate)
1753 return;
1754
1755 src_reg temp = src_reg(this, glsl_type::uvec4_type);
1756 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
1757 *reg = temp;
1758 }
1759
1760 vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
1761 void *log_data,
1762 const struct brw_sampler_prog_key_data *key_tex,
1763 struct brw_vue_prog_data *prog_data,
1764 const nir_shader *shader,
1765 void *mem_ctx,
1766 bool no_spills,
1767 int shader_time_index)
1768 : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
1769 key_tex(key_tex),
1770 prog_data(prog_data),
1771 fail_msg(NULL),
1772 first_non_payload_grf(0),
1773 need_all_constants_in_pull_buffer(false),
1774 no_spills(no_spills),
1775 shader_time_index(shader_time_index),
1776 last_scratch(0)
1777 {
1778 this->failed = false;
1779
1780 this->base_ir = NULL;
1781 this->current_annotation = NULL;
1782 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
1783
1784 memset(this->output_num_components, 0, sizeof(this->output_num_components));
1785
1786 this->virtual_grf_start = NULL;
1787 this->virtual_grf_end = NULL;
1788 this->live_intervals = NULL;
1789
1790 this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
1791
1792 this->uniforms = 0;
1793 }
1794
1795 vec4_visitor::~vec4_visitor()
1796 {
1797 }
1798
1799
1800 void
1801 vec4_visitor::fail(const char *format, ...)
1802 {
1803 va_list va;
1804 char *msg;
1805
1806 if (failed)
1807 return;
1808
1809 failed = true;
1810
1811 va_start(va, format);
1812 msg = ralloc_vasprintf(mem_ctx, format, va);
1813 va_end(va);
1814 msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);
1815
1816 this->fail_msg = msg;
1817
1818 if (debug_enabled) {
1819 fprintf(stderr, "%s", msg);
1820 }
1821 }
1822
1823 } /* namespace brw */