i965/vec4: Allow writemasking on math instructions on Gen7+.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->writes_accumulator = false;
46 this->conditional_mod = BRW_CONDITIONAL_NONE;
47 this->sampler = 0;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(dst_reg dst, src_reg src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
123 { \
124 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
125 src0, src1); \
126 }
127
128 #define ALU2_ACC(op) \
129 vec4_instruction * \
130 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
131 { \
132 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
133 BRW_OPCODE_##op, dst, src0, src1); \
134 inst->writes_accumulator = true; \
135 return inst; \
136 }
137
138 #define ALU3(op) \
139 vec4_instruction * \
140 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
141 { \
142 assert(brw->gen >= 6); \
143 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
144 src0, src1, src2); \
145 }
146
147 ALU1(NOT)
148 ALU1(MOV)
149 ALU1(FRC)
150 ALU1(RNDD)
151 ALU1(RNDE)
152 ALU1(RNDZ)
153 ALU1(F32TO16)
154 ALU1(F16TO32)
155 ALU2(ADD)
156 ALU2(MUL)
157 ALU2_ACC(MACH)
158 ALU2(AND)
159 ALU2(OR)
160 ALU2(XOR)
161 ALU2(DP3)
162 ALU2(DP4)
163 ALU2(DPH)
164 ALU2(SHL)
165 ALU2(SHR)
166 ALU2(ASR)
167 ALU3(LRP)
168 ALU1(BFREV)
169 ALU3(BFE)
170 ALU2(BFI1)
171 ALU3(BFI2)
172 ALU1(FBH)
173 ALU1(FBL)
174 ALU1(CBIT)
175 ALU3(MAD)
176 ALU2_ACC(ADDC)
177 ALU2_ACC(SUBB)
178 ALU2(MAC)
179
180 /** Gen4 predicated IF. */
181 vec4_instruction *
182 vec4_visitor::IF(uint32_t predicate)
183 {
184 vec4_instruction *inst;
185
186 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
187 inst->predicate = predicate;
188
189 return inst;
190 }
191
192 /** Gen6 IF with embedded comparison. */
193 vec4_instruction *
194 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
195 {
196 assert(brw->gen == 6);
197
198 vec4_instruction *inst;
199
200 resolve_ud_negate(&src0);
201 resolve_ud_negate(&src1);
202
203 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
204 src0, src1);
205 inst->conditional_mod = condition;
206
207 return inst;
208 }
209
210 /**
211 * CMP: Sets the low bit of the destination channels with the result
212 * of the comparison, while the upper bits are undefined, and updates
213 * the flag register with the packed 16 bits of the result.
214 */
215 vec4_instruction *
216 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
217 {
218 vec4_instruction *inst;
219
220 /* original gen4 does type conversion to the destination type
221 * before comparison, producing garbage results for floating
222 * point comparisons.
223 */
224 if (brw->gen == 4) {
225 dst.type = src0.type;
226 if (dst.file == HW_REG)
227 dst.fixed_hw_reg.type = dst.type;
228 }
229
230 resolve_ud_negate(&src0);
231 resolve_ud_negate(&src1);
232
233 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
234 inst->conditional_mod = condition;
235
236 return inst;
237 }
238
239 vec4_instruction *
240 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
241 {
242 vec4_instruction *inst;
243
244 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
245 dst, index);
246 inst->base_mrf = 14;
247 inst->mlen = 2;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
258 dst, src, index);
259 inst->base_mrf = 13;
260 inst->mlen = 3;
261
262 return inst;
263 }
264
265 void
266 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
267 {
268 static enum opcode dot_opcodes[] = {
269 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
270 };
271
272 emit(dot_opcodes[elements - 2], dst, src0, src1);
273 }
274
275 src_reg
276 vec4_visitor::fix_3src_operand(src_reg src)
277 {
278 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
279 * able to use vertical stride of zero to replicate the vec4 uniform, like
280 *
281 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
282 *
283 * But you can't, since vertical stride is always four in three-source
284 * instructions. Instead, insert a MOV instruction to do the replication so
285 * that the three-source instruction can consume it.
286 */
287
288 /* The MOV is only needed if the source is a uniform or immediate. */
289 if (src.file != UNIFORM && src.file != IMM)
290 return src;
291
292 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
293 return src;
294
295 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
296 expanded.type = src.type;
297 emit(MOV(expanded, src));
298 return src_reg(expanded);
299 }
300
301 src_reg
302 vec4_visitor::fix_math_operand(src_reg src)
303 {
304 /* The gen6 math instruction ignores the source modifiers --
305 * swizzle, abs, negate, and at least some parts of the register
306 * region description.
307 *
308 * Rather than trying to enumerate all these cases, *always* expand the
309 * operand to a temp GRF for gen6.
310 *
311 * For gen7, keep the operand as-is, except for immediates, which gen7 still
312 * can't use.
313 */
314
315 if (brw->gen == 7 && src.file != IMM)
316 return src;
317
318 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
319 expanded.type = src.type;
320 emit(MOV(expanded, src));
321 return src_reg(expanded);
322 }
323
324 void
325 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
326 {
327 src = fix_math_operand(src);
328
329 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
330 /* The gen6 math instruction must be align1, so we can't do
331 * writemasks.
332 */
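/* On Gen7+, the math instruction can run in align16 mode and honor the
 * destination writemask directly (the change this commit describes), so
 * this temporary-plus-MOV workaround only applies to Gen6.
 */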
333 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
334
335 emit(opcode, temp_dst, src);
336
337 emit(MOV(dst, src_reg(temp_dst)));
338 } else {
339 emit(opcode, dst, src);
340 }
341 }
342
343 void
344 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
345 {
346 vec4_instruction *inst = emit(opcode, dst, src);
347 inst->base_mrf = 1;
348 inst->mlen = 1;
349 }
350
351 void
352 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
353 {
354 switch (opcode) {
355 case SHADER_OPCODE_RCP:
356 case SHADER_OPCODE_RSQ:
357 case SHADER_OPCODE_SQRT:
358 case SHADER_OPCODE_EXP2:
359 case SHADER_OPCODE_LOG2:
360 case SHADER_OPCODE_SIN:
361 case SHADER_OPCODE_COS:
362 break;
363 default:
364 assert(!"not reached: bad math opcode");
365 return;
366 }
367
368 if (brw->gen >= 6) {
369 return emit_math1_gen6(opcode, dst, src);
370 } else {
371 return emit_math1_gen4(opcode, dst, src);
372 }
373 }
374
375 void
376 vec4_visitor::emit_math2_gen6(enum opcode opcode,
377 dst_reg dst, src_reg src0, src_reg src1)
378 {
379 src0 = fix_math_operand(src0);
380 src1 = fix_math_operand(src1);
381
382 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
383 /* The gen6 math instruction must be align1, so we can't do
384 * writemasks.
385 */
386 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
387 temp_dst.type = dst.type;
388
389 emit(opcode, temp_dst, src0, src1);
390
391 emit(MOV(dst, src_reg(temp_dst)));
392 } else {
393 emit(opcode, dst, src0, src1);
394 }
395 }
396
397 void
398 vec4_visitor::emit_math2_gen4(enum opcode opcode,
399 dst_reg dst, src_reg src0, src_reg src1)
400 {
401 vec4_instruction *inst = emit(opcode, dst, src0, src1);
402 inst->base_mrf = 1;
403 inst->mlen = 2;
404 }
405
406 void
407 vec4_visitor::emit_math(enum opcode opcode,
408 dst_reg dst, src_reg src0, src_reg src1)
409 {
410 switch (opcode) {
411 case SHADER_OPCODE_POW:
412 case SHADER_OPCODE_INT_QUOTIENT:
413 case SHADER_OPCODE_INT_REMAINDER:
414 break;
415 default:
416 assert(!"not reached: unsupported binary math opcode");
417 return;
418 }
419
420 if (brw->gen >= 6) {
421 return emit_math2_gen6(opcode, dst, src0, src1);
422 } else {
423 return emit_math2_gen4(opcode, dst, src0, src1);
424 }
425 }
426
427 void
428 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
429 {
430 if (brw->gen < 7)
431 assert(!"ir_unop_pack_half_2x16 should be lowered");
432
433 assert(dst.type == BRW_REGISTER_TYPE_UD);
434 assert(src0.type == BRW_REGISTER_TYPE_F);
435
436 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
437 *
438 * Because this instruction does not have a 16-bit floating-point type,
439 * the destination data type must be Word (W).
440 *
441 * The destination must be DWord-aligned and specify a horizontal stride
442 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
443 * each destination channel and the upper word is not modified.
444 *
445 * The above restriction implies that the f32to16 instruction must use
446 * align1 mode, because only in align1 mode is it possible to specify
447 * horizontal stride. We choose here to defy the hardware docs and emit
448 * align16 instructions.
449 *
450 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
451 * instructions. I was partially successful in that the code passed all
452 * tests. However, the code was dubiously correct and fragile, and the
453 * tests were not harsh enough to probe that frailty. Not trusting the
454 * code, I chose instead to remain in align16 mode in defiance of the hw
455 * docs).
456 *
457 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
458 * simulator, emitting a f32to16 in align16 mode with UD as destination
459 * data type is safe. The behavior differs from that specified in the PRM
460 * in that the upper word of each destination channel is cleared to 0.
461 */
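/* As a worked illustration (values assumed, not taken from the code):
 * packHalf2x16(vec2(1.0, 2.0)) should produce 0x40003C00, since
 * half(1.0) = 0x3C00 ends up in the low word and half(2.0) = 0x4000 is
 * shifted into the high word by the SHL/OR sequence emitted below.
 */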
462
463 dst_reg tmp_dst(this, glsl_type::uvec2_type);
464 src_reg tmp_src(tmp_dst);
465
466 #if 0
467 /* Verify the undocumented behavior on which the following instructions
468 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
469 * then the result of the bit-or instruction below will be incorrect.
470 *
471 * You should inspect the disasm output in order to verify that the MOV is
472 * not optimized away.
473 */
474 emit(MOV(tmp_dst, src_reg(0x12345678u)));
475 #endif
476
477 /* Give tmp the form below, where "." means untouched.
478 *
479 * w z y x w z y x
480 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
481 *
482 * That the upper word of each write-channel be 0 is required for the
483 * following bit-shift and bit-or instructions to work. Note that this
484 * relies on the undocumented hardware behavior mentioned above.
485 */
486 tmp_dst.writemask = WRITEMASK_XY;
487 emit(F32TO16(tmp_dst, src0));
488
489 /* Give the write-channels of dst the form:
490 * 0xhhhh0000
491 */
492 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
493 emit(SHL(dst, tmp_src, src_reg(16u)));
494
495 /* Finally, give the write-channels of dst the form of packHalf2x16's
496 * output:
497 * 0xhhhhllll
498 */
499 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
500 emit(OR(dst, src_reg(dst), tmp_src));
501 }
502
503 void
504 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
505 {
506 if (brw->gen < 7)
507 assert(!"ir_unop_unpack_half_2x16 should be lowered");
508
509 assert(dst.type == BRW_REGISTER_TYPE_F);
510 assert(src0.type == BRW_REGISTER_TYPE_UD);
511
512 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
513 *
514 * Because this instruction does not have a 16-bit floating-point type,
515 * the source data type must be Word (W). The destination type must be
516 * F (Float).
517 *
518 * To use W as the source data type, we must adjust horizontal strides,
519 * which is only possible in align1 mode. All my [chadv] attempts at
520 * emitting align1 instructions for unpackHalf2x16 failed to pass the
521 * Piglit tests, so I gave up.
522 *
523 * I've verified that, on gen7 hardware and the simulator, it is safe to
524 * emit f16to32 in align16 mode with UD as source data type.
525 */
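/* Illustrative example (input value assumed): for src0 = 0x40003C00, the
 * AND below extracts 0x3C00 (half 1.0) into X, the SHR extracts 0x4000
 * (half 2.0) into Y, and F16TO32 then yields vec2(1.0, 2.0) -- the inverse
 * of the packHalf2x16 example above.
 */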
526
527 dst_reg tmp_dst(this, glsl_type::uvec2_type);
528 src_reg tmp_src(tmp_dst);
529
530 tmp_dst.writemask = WRITEMASK_X;
531 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
532
533 tmp_dst.writemask = WRITEMASK_Y;
534 emit(SHR(tmp_dst, src0, src_reg(16u)));
535
536 dst.writemask = WRITEMASK_XY;
537 emit(F16TO32(dst, tmp_src));
538 }
539
540 void
541 vec4_visitor::visit_instructions(const exec_list *list)
542 {
543 foreach_list(node, list) {
544 ir_instruction *ir = (ir_instruction *)node;
545
546 base_ir = ir;
547 ir->accept(this);
548 }
549 }
550
551
552 static int
553 type_size(const struct glsl_type *type)
554 {
555 unsigned int i;
556 int size;
557
558 switch (type->base_type) {
559 case GLSL_TYPE_UINT:
560 case GLSL_TYPE_INT:
561 case GLSL_TYPE_FLOAT:
562 case GLSL_TYPE_BOOL:
563 if (type->is_matrix()) {
564 return type->matrix_columns;
565 } else {
566 /* Regardless of the size of the vector, it gets a vec4. This is bad
567 * packing for things like floats, but otherwise arrays become a
568 * mess. Hopefully a later pass over the code can pack scalars
569 * down if appropriate.
570 */
571 return 1;
572 }
573 case GLSL_TYPE_ARRAY:
574 assert(type->length > 0);
575 return type_size(type->fields.array) * type->length;
576 case GLSL_TYPE_STRUCT:
577 size = 0;
578 for (i = 0; i < type->length; i++) {
579 size += type_size(type->fields.structure[i].type);
580 }
581 return size;
582 case GLSL_TYPE_SAMPLER:
583 /* Samplers take up one slot in UNIFORMS[], but they're baked in
584 * at link time.
585 */
586 return 1;
587 case GLSL_TYPE_ATOMIC_UINT:
588 return 0;
589 case GLSL_TYPE_IMAGE:
590 case GLSL_TYPE_VOID:
591 case GLSL_TYPE_ERROR:
592 case GLSL_TYPE_INTERFACE:
593 assert(0);
594 break;
595 }
596
597 return 0;
598 }
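/* Some illustrative results of the rules above: float, vec2 and vec4 each
 * take one vec4 slot; mat3 takes 3 (one per column); vec3[4] takes 4; and
 * struct { vec4 a; float b; } takes 2.
 */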
599
600 int
601 vec4_visitor::virtual_grf_alloc(int size)
602 {
603 if (virtual_grf_array_size <= virtual_grf_count) {
604 if (virtual_grf_array_size == 0)
605 virtual_grf_array_size = 16;
606 else
607 virtual_grf_array_size *= 2;
608 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
609 virtual_grf_array_size);
610 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
611 virtual_grf_array_size);
612 }
613 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
614 virtual_grf_reg_count += size;
615 virtual_grf_sizes[virtual_grf_count] = size;
616 return virtual_grf_count++;
617 }
618
619 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
620 {
621 init();
622
623 this->file = GRF;
624 this->reg = v->virtual_grf_alloc(type_size(type));
625
626 if (type->is_array() || type->is_record()) {
627 this->swizzle = BRW_SWIZZLE_NOOP;
628 } else {
629 this->swizzle = swizzle_for_size(type->vector_elements);
630 }
631
632 this->type = brw_type_for_base_type(type);
633 }
634
635 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
636 {
637 init();
638
639 this->file = GRF;
640 this->reg = v->virtual_grf_alloc(type_size(type));
641
642 if (type->is_array() || type->is_record()) {
643 this->writemask = WRITEMASK_XYZW;
644 } else {
645 this->writemask = (1 << type->vector_elements) - 1;
646 }
647
648 this->type = brw_type_for_base_type(type);
649 }
650
651 /* Our support for uniforms is piggy-backed on the struct
652 * gl_fragment_program, because that's where the values actually
653 * get stored, rather than in some global gl_shader_program uniform
654 * store.
655 */
656 void
657 vec4_visitor::setup_uniform_values(ir_variable *ir)
658 {
659 int namelen = strlen(ir->name);
660
661 /* The data for our (non-builtin) uniforms is stored in a series of
662 * gl_uniform_driver_storage structs for each subcomponent that
663 * glGetUniformLocation() could name. We know it's been set up in the same
664 * order we'd walk the type, so walk the list of storage and find anything
665 * with our name, or the prefix of a component that starts with our name.
666 */
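/* For example (hypothetical uniform names): an ir->name of "lights" matches
 * storage entries named "lights", "lights[2]" or "lights[0].color", but not
 * "lightscale", since the character after the matched prefix must be '\0',
 * '.' or '['.
 */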
667 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
668 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
669
670 if (strncmp(ir->name, storage->name, namelen) != 0 ||
671 (storage->name[namelen] != 0 &&
672 storage->name[namelen] != '.' &&
673 storage->name[namelen] != '[')) {
674 continue;
675 }
676
677 gl_constant_value *components = storage->storage;
678 unsigned vector_count = (MAX2(storage->array_elements, 1) *
679 storage->type->matrix_columns);
680
681 for (unsigned s = 0; s < vector_count; s++) {
682 assert(uniforms < uniform_array_size);
683 uniform_vector_size[uniforms] = storage->type->vector_elements;
684
685 int i;
686 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
687 stage_prog_data->param[uniforms * 4 + i] = &components->f;
688 components++;
689 }
690 for (; i < 4; i++) {
691 static float zero = 0;
692 stage_prog_data->param[uniforms * 4 + i] = &zero;
693 }
694
695 uniforms++;
696 }
697 }
698 }
699
700 void
701 vec4_visitor::setup_uniform_clipplane_values()
702 {
703 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
704
705 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
706 assert(this->uniforms < uniform_array_size);
707 this->uniform_vector_size[this->uniforms] = 4;
708 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
709 this->userplane[i].type = BRW_REGISTER_TYPE_F;
710 for (int j = 0; j < 4; ++j) {
711 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
712 }
713 ++this->uniforms;
714 }
715 }
716
717 /* Our support for builtin uniforms is even scarier than non-builtin.
718 * It sits on top of the PROG_STATE_VAR parameters that are
719 * automatically updated from GL context state.
720 */
721 void
722 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
723 {
724 const ir_state_slot *const slots = ir->state_slots;
725 assert(ir->state_slots != NULL);
726
727 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
728 /* This state reference has already been setup by ir_to_mesa,
729 * but we'll get the same index back here. We can reference
730 * ParameterValues directly, since unlike brw_fs.cpp, we never
731 * add new state references during compile.
732 */
733 int index = _mesa_add_state_reference(this->prog->Parameters,
734 (gl_state_index *)slots[i].tokens);
735 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
736
737 assert(this->uniforms < uniform_array_size);
738 this->uniform_vector_size[this->uniforms] = 0;
739 /* Add each of the unique swizzled channels of the element.
740 * This will end up matching the size of the glsl_type of this field.
741 */
742 int last_swiz = -1;
743 for (unsigned int j = 0; j < 4; j++) {
744 int swiz = GET_SWZ(slots[i].swizzle, j);
745 last_swiz = swiz;
746
747 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
748 assert(this->uniforms < uniform_array_size);
749 if (swiz <= last_swiz)
750 this->uniform_vector_size[this->uniforms]++;
751 }
752 this->uniforms++;
753 }
754 }
755
756 dst_reg *
757 vec4_visitor::variable_storage(ir_variable *var)
758 {
759 return (dst_reg *)hash_table_find(this->variable_ht, var);
760 }
761
762 void
763 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
764 {
765 ir_expression *expr = ir->as_expression();
766
767 *predicate = BRW_PREDICATE_NORMAL;
768
769 if (expr) {
770 src_reg op[2];
771 vec4_instruction *inst;
772
773 assert(expr->get_num_operands() <= 2);
774 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
775 expr->operands[i]->accept(this);
776 op[i] = this->result;
777
778 resolve_ud_negate(&op[i]);
779 }
780
781 switch (expr->operation) {
782 case ir_unop_logic_not:
783 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
784 inst->conditional_mod = BRW_CONDITIONAL_Z;
785 break;
786
787 case ir_binop_logic_xor:
788 inst = emit(XOR(dst_null_d(), op[0], op[1]));
789 inst->conditional_mod = BRW_CONDITIONAL_NZ;
790 break;
791
792 case ir_binop_logic_or:
793 inst = emit(OR(dst_null_d(), op[0], op[1]));
794 inst->conditional_mod = BRW_CONDITIONAL_NZ;
795 break;
796
797 case ir_binop_logic_and:
798 inst = emit(AND(dst_null_d(), op[0], op[1]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 break;
801
802 case ir_unop_f2b:
803 if (brw->gen >= 6) {
804 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
805 } else {
806 inst = emit(MOV(dst_null_f(), op[0]));
807 inst->conditional_mod = BRW_CONDITIONAL_NZ;
808 }
809 break;
810
811 case ir_unop_i2b:
812 if (brw->gen >= 6) {
813 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
814 } else {
815 inst = emit(MOV(dst_null_d(), op[0]));
816 inst->conditional_mod = BRW_CONDITIONAL_NZ;
817 }
818 break;
819
820 case ir_binop_all_equal:
821 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
822 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
823 break;
824
825 case ir_binop_any_nequal:
826 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
827 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
828 break;
829
830 case ir_unop_any:
831 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
832 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
833 break;
834
835 case ir_binop_greater:
836 case ir_binop_gequal:
837 case ir_binop_less:
838 case ir_binop_lequal:
839 case ir_binop_equal:
840 case ir_binop_nequal:
841 emit(CMP(dst_null_d(), op[0], op[1],
842 brw_conditional_for_comparison(expr->operation)));
843 break;
844
845 default:
846 assert(!"not reached");
847 break;
848 }
849 return;
850 }
851
852 ir->accept(this);
853
854 resolve_ud_negate(&this->result);
855
856 if (brw->gen >= 6) {
857 vec4_instruction *inst = emit(AND(dst_null_d(),
858 this->result, src_reg(1)));
859 inst->conditional_mod = BRW_CONDITIONAL_NZ;
860 } else {
861 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
862 inst->conditional_mod = BRW_CONDITIONAL_NZ;
863 }
864 }
865
866 /**
867 * Emit a gen6 IF statement with the comparison folded into the IF
868 * instruction.
869 */
870 void
871 vec4_visitor::emit_if_gen6(ir_if *ir)
872 {
873 ir_expression *expr = ir->condition->as_expression();
874
875 if (expr) {
876 src_reg op[2];
877 dst_reg temp;
878
879 assert(expr->get_num_operands() <= 2);
880 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
881 expr->operands[i]->accept(this);
882 op[i] = this->result;
883 }
884
885 switch (expr->operation) {
886 case ir_unop_logic_not:
887 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
888 return;
889
890 case ir_binop_logic_xor:
891 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
892 return;
893
894 case ir_binop_logic_or:
895 temp = dst_reg(this, glsl_type::bool_type);
896 emit(OR(temp, op[0], op[1]));
897 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
898 return;
899
900 case ir_binop_logic_and:
901 temp = dst_reg(this, glsl_type::bool_type);
902 emit(AND(temp, op[0], op[1]));
903 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
904 return;
905
906 case ir_unop_f2b:
907 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
908 return;
909
910 case ir_unop_i2b:
911 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
912 return;
913
914 case ir_binop_greater:
915 case ir_binop_gequal:
916 case ir_binop_less:
917 case ir_binop_lequal:
918 case ir_binop_equal:
919 case ir_binop_nequal:
920 emit(IF(op[0], op[1],
921 brw_conditional_for_comparison(expr->operation)));
922 return;
923
924 case ir_binop_all_equal:
925 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
926 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
927 return;
928
929 case ir_binop_any_nequal:
930 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
931 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
932 return;
933
934 case ir_unop_any:
935 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
936 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
937 return;
938
939 default:
940 assert(!"not reached");
941 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
942 return;
943 }
944 return;
945 }
946
947 ir->condition->accept(this);
948
949 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
950 }
951
952 void
953 vec4_visitor::visit(ir_variable *ir)
954 {
955 dst_reg *reg = NULL;
956
957 if (variable_storage(ir))
958 return;
959
960 switch (ir->data.mode) {
961 case ir_var_shader_in:
962 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
963 break;
964
965 case ir_var_shader_out:
966 reg = new(mem_ctx) dst_reg(this, ir->type);
967
968 for (int i = 0; i < type_size(ir->type); i++) {
969 output_reg[ir->data.location + i] = *reg;
970 output_reg[ir->data.location + i].reg_offset = i;
971 output_reg[ir->data.location + i].type =
972 brw_type_for_base_type(ir->type->get_scalar_type());
973 output_reg_annotation[ir->data.location + i] = ir->name;
974 }
975 break;
976
977 case ir_var_auto:
978 case ir_var_temporary:
979 reg = new(mem_ctx) dst_reg(this, ir->type);
980 break;
981
982 case ir_var_uniform:
983 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
984
985 /* Thanks to the lower_ubo_reference pass, we will see only
986 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
987 * variables, so no need for them to be in variable_ht.
988 *
989 * Atomic counters take no uniform storage, no need to do
990 * anything here.
991 */
992 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
993 return;
994
995 /* Track how big the whole uniform variable is, in case we need to put a
996 * copy of its data into pull constants for array access.
997 */
998 assert(this->uniforms < uniform_array_size);
999 this->uniform_size[this->uniforms] = type_size(ir->type);
1000
1001 if (!strncmp(ir->name, "gl_", 3)) {
1002 setup_builtin_uniform_values(ir);
1003 } else {
1004 setup_uniform_values(ir);
1005 }
1006 break;
1007
1008 case ir_var_system_value:
1009 reg = make_reg_for_system_value(ir);
1010 break;
1011
1012 default:
1013 assert(!"not reached");
1014 }
1015
1016 reg->type = brw_type_for_base_type(ir->type);
1017 hash_table_insert(this->variable_ht, reg, ir);
1018 }
1019
1020 void
1021 vec4_visitor::visit(ir_loop *ir)
1022 {
1023 /* We don't want debugging output to print the whole body of the
1024 * loop as the annotation.
1025 */
1026 this->base_ir = NULL;
1027
1028 emit(BRW_OPCODE_DO);
1029
1030 visit_instructions(&ir->body_instructions);
1031
1032 emit(BRW_OPCODE_WHILE);
1033 }
1034
1035 void
1036 vec4_visitor::visit(ir_loop_jump *ir)
1037 {
1038 switch (ir->mode) {
1039 case ir_loop_jump::jump_break:
1040 emit(BRW_OPCODE_BREAK);
1041 break;
1042 case ir_loop_jump::jump_continue:
1043 emit(BRW_OPCODE_CONTINUE);
1044 break;
1045 }
1046 }
1047
1048
1049 void
1050 vec4_visitor::visit(ir_function_signature *ir)
1051 {
1052 assert(0);
1053 (void)ir;
1054 }
1055
1056 void
1057 vec4_visitor::visit(ir_function *ir)
1058 {
1059 /* Ignore function bodies other than main() -- we shouldn't see calls to
1060 * them since they should all be inlined.
1061 */
1062 if (strcmp(ir->name, "main") == 0) {
1063 const ir_function_signature *sig;
1064 exec_list empty;
1065
1066 sig = ir->matching_signature(NULL, &empty);
1067
1068 assert(sig);
1069
1070 visit_instructions(&sig->body);
1071 }
1072 }
1073
1074 bool
1075 vec4_visitor::try_emit_sat(ir_expression *ir)
1076 {
1077 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1078 if (!sat_src)
1079 return false;
1080
1081 sat_src->accept(this);
1082 src_reg src = this->result;
1083
1084 this->result = src_reg(this, ir->type);
1085 vec4_instruction *inst;
1086 inst = emit(MOV(dst_reg(this->result), src));
1087 inst->saturate = true;
1088
1089 return true;
1090 }
1091
1092 bool
1093 vec4_visitor::try_emit_mad(ir_expression *ir)
1094 {
1095 /* 3-src instructions were introduced in gen6. */
1096 if (brw->gen < 6)
1097 return false;
1098
1099 /* MAD can only handle floating-point data. */
1100 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1101 return false;
1102
1103 ir_rvalue *nonmul = ir->operands[1];
1104 ir_expression *mul = ir->operands[0]->as_expression();
1105
1106 if (!mul || mul->operation != ir_binop_mul) {
1107 nonmul = ir->operands[0];
1108 mul = ir->operands[1]->as_expression();
1109
1110 if (!mul || mul->operation != ir_binop_mul)
1111 return false;
1112 }
1113
1114 nonmul->accept(this);
1115 src_reg src0 = fix_3src_operand(this->result);
1116
1117 mul->operands[0]->accept(this);
1118 src_reg src1 = fix_3src_operand(this->result);
1119
1120 mul->operands[1]->accept(this);
1121 src_reg src2 = fix_3src_operand(this->result);
1122
1123 this->result = src_reg(this, ir->type);
1124 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1125
1126 return true;
1127 }
1128
1129 void
1130 vec4_visitor::emit_bool_comparison(unsigned int op,
1131 dst_reg dst, src_reg src0, src_reg src1)
1132 {
1133 /* original gen4 does destination conversion before comparison. */
1134 if (brw->gen < 5)
1135 dst.type = src0.type;
1136
1137 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1138
1139 dst.type = BRW_REGISTER_TYPE_D;
1140 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1141 }
1142
1143 void
1144 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1145 src_reg src0, src_reg src1)
1146 {
1147 vec4_instruction *inst;
1148
1149 if (brw->gen >= 6) {
1150 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1151 inst->conditional_mod = conditionalmod;
1152 } else {
1153 emit(CMP(dst, src0, src1, conditionalmod));
1154
1155 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1156 inst->predicate = BRW_PREDICATE_NORMAL;
1157 }
1158 }
1159
1160 void
1161 vec4_visitor::emit_lrp(const dst_reg &dst,
1162 const src_reg &x, const src_reg &y, const src_reg &a)
1163 {
1164 if (brw->gen >= 6) {
1165 /* Note that the instruction's argument order is reversed from GLSL
1166 * and the IR.
1167 */
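/* The hardware LRP computes dst = src0 * src1 + (1 - src0) * src2, so
 * passing (a, y, x) below yields a * y + (1 - a) * x, i.e. GLSL's
 * mix(x, y, a).
 */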
1168 emit(LRP(dst,
1169 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1170 } else {
1171 /* Earlier generations don't support three source operations, so we
1172 * need to emit x*(1-a) + y*a.
1173 */
1174 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1175 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1176 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1177 y_times_a.writemask = dst.writemask;
1178 one_minus_a.writemask = dst.writemask;
1179 x_times_one_minus_a.writemask = dst.writemask;
1180
1181 emit(MUL(y_times_a, y, a));
1182 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1183 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1184 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1185 }
1186 }
1187
1188 void
1189 vec4_visitor::visit(ir_expression *ir)
1190 {
1191 unsigned int operand;
1192 src_reg op[Elements(ir->operands)];
1193 src_reg result_src;
1194 dst_reg result_dst;
1195 vec4_instruction *inst;
1196
1197 if (try_emit_sat(ir))
1198 return;
1199
1200 if (ir->operation == ir_binop_add) {
1201 if (try_emit_mad(ir))
1202 return;
1203 }
1204
1205 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1206 this->result.file = BAD_FILE;
1207 ir->operands[operand]->accept(this);
1208 if (this->result.file == BAD_FILE) {
1209 fprintf(stderr, "Failed to get tree for expression operand:\n");
1210 ir->operands[operand]->fprint(stderr);
1211 exit(1);
1212 }
1213 op[operand] = this->result;
1214
1215 /* Matrix expression operands should have been broken down to vector
1216 * operations already.
1217 */
1218 assert(!ir->operands[operand]->type->is_matrix());
1219 }
1220
1221 int vector_elements = ir->operands[0]->type->vector_elements;
1222 if (ir->operands[1]) {
1223 vector_elements = MAX2(vector_elements,
1224 ir->operands[1]->type->vector_elements);
1225 }
1226
1227 this->result.file = BAD_FILE;
1228
1229 /* Storage for our result. Ideally for an assignment we'd be using
1230 * the actual storage for the result here, instead.
1231 */
1232 result_src = src_reg(this, ir->type);
1233 /* convenience for the emit functions below. */
1234 result_dst = dst_reg(result_src);
1235 /* If nothing special happens, this is the result. */
1236 this->result = result_src;
1237 /* Limit writes to the channels that will be used by result_src later.
1238 * This does limit this temp's use as a temporary for multi-instruction
1239 * sequences.
1240 */
1241 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1242
1243 switch (ir->operation) {
1244 case ir_unop_logic_not:
1245 /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes the
1246 * one's complement of the whole register, not just bit 0.
1247 */
1248 emit(XOR(result_dst, op[0], src_reg(1)));
1249 break;
1250 case ir_unop_neg:
1251 op[0].negate = !op[0].negate;
1252 emit(MOV(result_dst, op[0]));
1253 break;
1254 case ir_unop_abs:
1255 op[0].abs = true;
1256 op[0].negate = false;
1257 emit(MOV(result_dst, op[0]));
1258 break;
1259
1260 case ir_unop_sign:
1261 if (ir->type->is_float()) {
1262 /* AND(val, 0x80000000) gives the sign bit.
1263 *
1264 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1265 * zero.
1266 */
1267 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1268
1269 op[0].type = BRW_REGISTER_TYPE_UD;
1270 result_dst.type = BRW_REGISTER_TYPE_UD;
1271 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1272
1273 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1274 inst->predicate = BRW_PREDICATE_NORMAL;
1275
1276 this->result.type = BRW_REGISTER_TYPE_F;
1277 } else {
1278 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1279 * -> non-negative val generates 0x00000000.
1280 * Predicated OR sets 1 if val is positive.
1281 */
1282 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1283
1284 emit(ASR(result_dst, op[0], src_reg(31)));
1285
1286 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1287 inst->predicate = BRW_PREDICATE_NORMAL;
1288 }
1289 break;
1290
1291 case ir_unop_rcp:
1292 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1293 break;
1294
1295 case ir_unop_exp2:
1296 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1297 break;
1298 case ir_unop_log2:
1299 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1300 break;
1301 case ir_unop_exp:
1302 case ir_unop_log:
1303 assert(!"not reached: should be handled by ir_explog_to_explog2");
1304 break;
1305 case ir_unop_sin:
1306 case ir_unop_sin_reduced:
1307 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1308 break;
1309 case ir_unop_cos:
1310 case ir_unop_cos_reduced:
1311 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1312 break;
1313
1314 case ir_unop_dFdx:
1315 case ir_unop_dFdy:
1316 assert(!"derivatives not valid in vertex shader");
1317 break;
1318
1319 case ir_unop_bitfield_reverse:
1320 emit(BFREV(result_dst, op[0]));
1321 break;
1322 case ir_unop_bit_count:
1323 emit(CBIT(result_dst, op[0]));
1324 break;
1325 case ir_unop_find_msb: {
1326 src_reg temp = src_reg(this, glsl_type::uint_type);
1327
1328 inst = emit(FBH(dst_reg(temp), op[0]));
1329 inst->dst.writemask = WRITEMASK_XYZW;
1330
1331 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1332 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1333 * subtract the result from 31 to convert the MSB count into an LSB count.
1334 */
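/* Worked example (input assumed for illustration): for op[0] = 4 (bit 2
 * set), FBH returns 29, counting from the MSB side; the predicated ADD
 * below then computes 31 - 29 = 2, which is findMSB(4).
 */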
1335
1336 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1337 temp.swizzle = BRW_SWIZZLE_NOOP;
1338 emit(MOV(result_dst, temp));
1339
1340 src_reg src_tmp = src_reg(result_dst);
1341 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1342
1343 src_tmp.negate = true;
1344 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1345 inst->predicate = BRW_PREDICATE_NORMAL;
1346 break;
1347 }
1348 case ir_unop_find_lsb:
1349 emit(FBL(result_dst, op[0]));
1350 break;
1351
1352 case ir_unop_noise:
1353 assert(!"not reached: should be handled by lower_noise");
1354 break;
1355
1356 case ir_binop_add:
1357 emit(ADD(result_dst, op[0], op[1]));
1358 break;
1359 case ir_binop_sub:
1360 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1361 break;
1362
1363 case ir_binop_mul:
1364 if (brw->gen < 8 && ir->type->is_integer()) {
1365 /* For integer multiplication, the MUL uses the low 16 bits of one of
1366 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1367 * accumulates the contribution of the upper 16 bits of that
1368 * operand. If we can determine that one of the args is in the low
1369 * 16 bits, though, we can just emit a single MUL.
1370 */
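/* Sketching the arithmetic behind this: with b = b_lo + (b_hi << 16), the
 * 32-bit product a * b equals a * b_lo + ((a * b_hi) << 16) mod 2^32. MUL
 * produces the first term and MACH supplies the second, so when one
 * operand's upper 16 bits are known to be zero the MUL alone suffices.
 */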
1371 if (ir->operands[0]->is_uint16_constant()) {
1372 if (brw->gen < 7)
1373 emit(MUL(result_dst, op[0], op[1]));
1374 else
1375 emit(MUL(result_dst, op[1], op[0]));
1376 } else if (ir->operands[1]->is_uint16_constant()) {
1377 if (brw->gen < 7)
1378 emit(MUL(result_dst, op[1], op[0]));
1379 else
1380 emit(MUL(result_dst, op[0], op[1]));
1381 } else {
1382 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1383
1384 emit(MUL(acc, op[0], op[1]));
1385 emit(MACH(dst_null_d(), op[0], op[1]));
1386 emit(MOV(result_dst, src_reg(acc)));
1387 }
1388 } else {
1389 emit(MUL(result_dst, op[0], op[1]));
1390 }
1391 break;
1392 case ir_binop_imul_high: {
1393 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1394
1395 emit(MUL(acc, op[0], op[1]));
1396 emit(MACH(result_dst, op[0], op[1]));
1397 break;
1398 }
1399 case ir_binop_div:
1400 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1401 assert(ir->type->is_integer());
1402 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1403 break;
1404 case ir_binop_carry: {
1405 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1406
1407 emit(ADDC(dst_null_ud(), op[0], op[1]));
1408 emit(MOV(result_dst, src_reg(acc)));
1409 break;
1410 }
1411 case ir_binop_borrow: {
1412 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1413
1414 emit(SUBB(dst_null_ud(), op[0], op[1]));
1415 emit(MOV(result_dst, src_reg(acc)));
1416 break;
1417 }
1418 case ir_binop_mod:
1419 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1420 assert(ir->type->is_integer());
1421 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1422 break;
1423
1424 case ir_binop_less:
1425 case ir_binop_greater:
1426 case ir_binop_lequal:
1427 case ir_binop_gequal:
1428 case ir_binop_equal:
1429 case ir_binop_nequal: {
1430 emit(CMP(result_dst, op[0], op[1],
1431 brw_conditional_for_comparison(ir->operation)));
1432 emit(AND(result_dst, result_src, src_reg(0x1)));
1433 break;
1434 }
1435
1436 case ir_binop_all_equal:
1437 /* "==" operator producing a scalar boolean. */
1438 if (ir->operands[0]->type->is_vector() ||
1439 ir->operands[1]->type->is_vector()) {
1440 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1441 emit(MOV(result_dst, src_reg(0)));
1442 inst = emit(MOV(result_dst, src_reg(1)));
1443 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1444 } else {
1445 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1446 emit(AND(result_dst, result_src, src_reg(0x1)));
1447 }
1448 break;
1449 case ir_binop_any_nequal:
1450 /* "!=" operator producing a scalar boolean. */
1451 if (ir->operands[0]->type->is_vector() ||
1452 ir->operands[1]->type->is_vector()) {
1453 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1454
1455 emit(MOV(result_dst, src_reg(0)));
1456 inst = emit(MOV(result_dst, src_reg(1)));
1457 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1458 } else {
1459 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1460 emit(AND(result_dst, result_src, src_reg(0x1)));
1461 }
1462 break;
1463
1464 case ir_unop_any:
1465 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1466 emit(MOV(result_dst, src_reg(0)));
1467
1468 inst = emit(MOV(result_dst, src_reg(1)));
1469 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1470 break;
1471
1472 case ir_binop_logic_xor:
1473 emit(XOR(result_dst, op[0], op[1]));
1474 break;
1475
1476 case ir_binop_logic_or:
1477 emit(OR(result_dst, op[0], op[1]));
1478 break;
1479
1480 case ir_binop_logic_and:
1481 emit(AND(result_dst, op[0], op[1]));
1482 break;
1483
1484 case ir_binop_dot:
1485 assert(ir->operands[0]->type->is_vector());
1486 assert(ir->operands[0]->type == ir->operands[1]->type);
1487 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1488 break;
1489
1490 case ir_unop_sqrt:
1491 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1492 break;
1493 case ir_unop_rsq:
1494 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1495 break;
1496
1497 case ir_unop_bitcast_i2f:
1498 case ir_unop_bitcast_u2f:
1499 this->result = op[0];
1500 this->result.type = BRW_REGISTER_TYPE_F;
1501 break;
1502
1503 case ir_unop_bitcast_f2i:
1504 this->result = op[0];
1505 this->result.type = BRW_REGISTER_TYPE_D;
1506 break;
1507
1508 case ir_unop_bitcast_f2u:
1509 this->result = op[0];
1510 this->result.type = BRW_REGISTER_TYPE_UD;
1511 break;
1512
1513 case ir_unop_i2f:
1514 case ir_unop_i2u:
1515 case ir_unop_u2i:
1516 case ir_unop_u2f:
1517 case ir_unop_b2f:
1518 case ir_unop_b2i:
1519 case ir_unop_f2i:
1520 case ir_unop_f2u:
1521 emit(MOV(result_dst, op[0]));
1522 break;
1523 case ir_unop_f2b:
1524 case ir_unop_i2b: {
1525 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1526 emit(AND(result_dst, result_src, src_reg(1)));
1527 break;
1528 }
1529
1530 case ir_unop_trunc:
1531 emit(RNDZ(result_dst, op[0]));
1532 break;
1533 case ir_unop_ceil:
1534 op[0].negate = !op[0].negate;
1535 inst = emit(RNDD(result_dst, op[0]));
1536 this->result.negate = true;
1537 break;
1538 case ir_unop_floor:
1539 inst = emit(RNDD(result_dst, op[0]));
1540 break;
1541 case ir_unop_fract:
1542 inst = emit(FRC(result_dst, op[0]));
1543 break;
1544 case ir_unop_round_even:
1545 emit(RNDE(result_dst, op[0]));
1546 break;
1547
1548 case ir_binop_min:
1549 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1550 break;
1551 case ir_binop_max:
1552 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1553 break;
1554
1555 case ir_binop_pow:
1556 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1557 break;
1558
1559 case ir_unop_bit_not:
1560 inst = emit(NOT(result_dst, op[0]));
1561 break;
1562 case ir_binop_bit_and:
1563 inst = emit(AND(result_dst, op[0], op[1]));
1564 break;
1565 case ir_binop_bit_xor:
1566 inst = emit(XOR(result_dst, op[0], op[1]));
1567 break;
1568 case ir_binop_bit_or:
1569 inst = emit(OR(result_dst, op[0], op[1]));
1570 break;
1571
1572 case ir_binop_lshift:
1573 inst = emit(SHL(result_dst, op[0], op[1]));
1574 break;
1575
1576 case ir_binop_rshift:
1577 if (ir->type->base_type == GLSL_TYPE_INT)
1578 inst = emit(ASR(result_dst, op[0], op[1]));
1579 else
1580 inst = emit(SHR(result_dst, op[0], op[1]));
1581 break;
1582
1583 case ir_binop_bfm:
1584 emit(BFI1(result_dst, op[0], op[1]));
1585 break;
1586
1587 case ir_binop_ubo_load: {
1588 ir_constant *uniform_block = ir->operands[0]->as_constant();
1589 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1590 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1591 src_reg offset;
1592
1593 /* Now, load the vector from that offset. */
1594 assert(ir->type->is_vector() || ir->type->is_scalar());
1595
1596 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1597 packed_consts.type = result.type;
1598 src_reg surf_index =
1599 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1600 if (const_offset_ir) {
1601 if (brw->gen >= 8) {
1602 /* Store the offset in a GRF so we can send-from-GRF. */
1603 offset = src_reg(this, glsl_type::int_type);
1604 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1605 } else {
1606 /* Immediates are fine on older generations since they'll be moved
1607 * to a (potentially fake) MRF at the generator level.
1608 */
1609 offset = src_reg(const_offset / 16);
1610 }
1611 } else {
1612 offset = src_reg(this, glsl_type::uint_type);
1613 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1614 }
1615
1616 if (brw->gen >= 7) {
1617 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1618 grf_offset.type = offset.type;
1619
1620 emit(MOV(grf_offset, offset));
1621
1622 emit(new(mem_ctx) vec4_instruction(this,
1623 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1624 dst_reg(packed_consts),
1625 surf_index,
1626 src_reg(grf_offset)));
1627 } else {
1628 vec4_instruction *pull =
1629 emit(new(mem_ctx) vec4_instruction(this,
1630 VS_OPCODE_PULL_CONSTANT_LOAD,
1631 dst_reg(packed_consts),
1632 surf_index,
1633 offset));
1634 pull->base_mrf = 14;
1635 pull->mlen = 1;
1636 }
1637
1638 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1639 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1640 const_offset % 16 / 4,
1641 const_offset % 16 / 4,
1642 const_offset % 16 / 4);
1643
1644 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1645 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1646 emit(CMP(result_dst, packed_consts, src_reg(0u),
1647 BRW_CONDITIONAL_NZ));
1648 emit(AND(result_dst, result, src_reg(0x1)));
1649 } else {
1650 emit(MOV(result_dst, packed_consts));
1651 }
1652 break;
1653 }
1654
1655 case ir_binop_vector_extract:
1656 assert(!"should have been lowered by vec_index_to_cond_assign");
1657 break;
1658
1659 case ir_triop_fma:
1660 op[0] = fix_3src_operand(op[0]);
1661 op[1] = fix_3src_operand(op[1]);
1662 op[2] = fix_3src_operand(op[2]);
1663 /* Note that the instruction's argument order is reversed from GLSL
1664 * and the IR.
1665 */
1666 emit(MAD(result_dst, op[2], op[1], op[0]));
1667 break;
1668
1669 case ir_triop_lrp:
1670 emit_lrp(result_dst, op[0], op[1], op[2]);
1671 break;
1672
1673 case ir_triop_csel:
1674 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1675 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1676 inst->predicate = BRW_PREDICATE_NORMAL;
1677 break;
1678
1679 case ir_triop_bfi:
1680 op[0] = fix_3src_operand(op[0]);
1681 op[1] = fix_3src_operand(op[1]);
1682 op[2] = fix_3src_operand(op[2]);
1683 emit(BFI2(result_dst, op[0], op[1], op[2]));
1684 break;
1685
1686 case ir_triop_bitfield_extract:
1687 op[0] = fix_3src_operand(op[0]);
1688 op[1] = fix_3src_operand(op[1]);
1689 op[2] = fix_3src_operand(op[2]);
1690 /* Note that the instruction's argument order is reversed from GLSL
1691 * and the IR.
1692 */
1693 emit(BFE(result_dst, op[2], op[1], op[0]));
1694 break;
1695
1696 case ir_triop_vector_insert:
1697 assert(!"should have been lowered by lower_vector_insert");
1698 break;
1699
1700 case ir_quadop_bitfield_insert:
1701 assert(!"not reached: should be handled by "
1702 "bitfield_insert_to_bfm_bfi\n");
1703 break;
1704
1705 case ir_quadop_vector:
1706 assert(!"not reached: should be handled by lower_quadop_vector");
1707 break;
1708
1709 case ir_unop_pack_half_2x16:
1710 emit_pack_half_2x16(result_dst, op[0]);
1711 break;
1712 case ir_unop_unpack_half_2x16:
1713 emit_unpack_half_2x16(result_dst, op[0]);
1714 break;
1715 case ir_unop_pack_snorm_2x16:
1716 case ir_unop_pack_snorm_4x8:
1717 case ir_unop_pack_unorm_2x16:
1718 case ir_unop_pack_unorm_4x8:
1719 case ir_unop_unpack_snorm_2x16:
1720 case ir_unop_unpack_snorm_4x8:
1721 case ir_unop_unpack_unorm_2x16:
1722 case ir_unop_unpack_unorm_4x8:
1723 assert(!"not reached: should be handled by lower_packing_builtins");
1724 break;
1725 case ir_unop_unpack_half_2x16_split_x:
1726 case ir_unop_unpack_half_2x16_split_y:
1727 case ir_binop_pack_half_2x16_split:
1728 assert(!"not reached: should not occur in vertex shader");
1729 break;
1730 case ir_binop_ldexp:
1731 assert(!"not reached: should be handled by ldexp_to_arith()");
1732 break;
1733 }
1734 }
1735
1736
1737 void
1738 vec4_visitor::visit(ir_swizzle *ir)
1739 {
1740 src_reg src;
1741 int i = 0;
1742 int swizzle[4];
1743
1744 /* Note that this only handles swizzles in expressions, not those on the
1745 * left-hand side of an assignment, which use write masking. See ir_assignment
1746 * for that.
1747 */
1748
1749 ir->val->accept(this);
1750 src = this->result;
1751 assert(src.file != BAD_FILE);
1752
1753 for (i = 0; i < ir->type->vector_elements; i++) {
1754 switch (i) {
1755 case 0:
1756 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1757 break;
1758 case 1:
1759 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1760 break;
1761 case 2:
1762 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1763 break;
1764 case 3:
1765 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1766 break;
1767 }
1768 }
1769 for (; i < 4; i++) {
1770 /* Replicate the last channel out. */
1771 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1772 }
1773
1774 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1775
1776 this->result = src;
1777 }
1778
1779 void
1780 vec4_visitor::visit(ir_dereference_variable *ir)
1781 {
1782 const struct glsl_type *type = ir->type;
1783 dst_reg *reg = variable_storage(ir->var);
1784
1785 if (!reg) {
1786 fail("Failed to find variable storage for %s\n", ir->var->name);
1787 this->result = src_reg(brw_null_reg());
1788 return;
1789 }
1790
1791 this->result = src_reg(*reg);
1792
1793 /* System values get their swizzle from the dst_reg writemask */
1794 if (ir->var->data.mode == ir_var_system_value)
1795 return;
1796
1797 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1798 this->result.swizzle = swizzle_for_size(type->vector_elements);
1799 }
1800
1801
1802 int
1803 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1804 {
1805 /* Under normal circumstances array elements are stored consecutively, so
1806 * the stride is equal to the size of the array element.
1807 */
1808 return type_size(ir->type);
1809 }
1810
1811
1812 void
1813 vec4_visitor::visit(ir_dereference_array *ir)
1814 {
1815 ir_constant *constant_index;
1816 src_reg src;
1817 int array_stride = compute_array_stride(ir);
1818
1819 constant_index = ir->array_index->constant_expression_value();
1820
1821 ir->array->accept(this);
1822 src = this->result;
1823
1824 if (constant_index) {
1825 src.reg_offset += constant_index->value.i[0] * array_stride;
1826 } else {
1827 /* Variable index array dereference. It eats the "vec4" of the
1828 * base of the array and an index that offsets the Mesa register
1829 * index.
1830 */
1831 ir->array_index->accept(this);
1832
1833 src_reg index_reg;
1834
1835 if (array_stride == 1) {
1836 index_reg = this->result;
1837 } else {
1838 index_reg = src_reg(this, glsl_type::int_type);
1839
1840 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1841 }
1842
1843 if (src.reladdr) {
1844 src_reg temp = src_reg(this, glsl_type::int_type);
1845
1846 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1847
1848 index_reg = temp;
1849 }
1850
1851 src.reladdr = ralloc(mem_ctx, src_reg);
1852 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1853 }
1854
1855 /* If the type is smaller than a vec4, replicate the last channel out. */
1856 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1857 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1858 else
1859 src.swizzle = BRW_SWIZZLE_NOOP;
1860 src.type = brw_type_for_base_type(ir->type);
1861
1862 this->result = src;
1863 }
1864
1865 void
1866 vec4_visitor::visit(ir_dereference_record *ir)
1867 {
1868 unsigned int i;
1869 const glsl_type *struct_type = ir->record->type;
1870 int offset = 0;
1871
1872 ir->record->accept(this);
1873
1874 for (i = 0; i < struct_type->length; i++) {
1875 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1876 break;
1877 offset += type_size(struct_type->fields.structure[i].type);
1878 }
1879
1880 /* If the type is smaller than a vec4, replicate the last channel out. */
1881 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1882 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1883 else
1884 this->result.swizzle = BRW_SWIZZLE_NOOP;
1885 this->result.type = brw_type_for_base_type(ir->type);
1886
1887 this->result.reg_offset += offset;
1888 }
1889
1890 /**
1891 * We want to be careful in assignment setup to hit the actual storage
1892 * instead of potentially using a temporary like we might with the
1893 * ir_dereference handler.
1894 */
1895 static dst_reg
1896 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1897 {
1898 /* The LHS must be a dereference. If the LHS is a variable indexed array
1899 * access of a vector, it must be separated into a series of conditional moves
1900 * before reaching this point (see ir_vec_index_to_cond_assign).
1901 */
1902 assert(ir->as_dereference());
1903 ir_dereference_array *deref_array = ir->as_dereference_array();
1904 if (deref_array) {
1905 assert(!deref_array->array->type->is_vector());
1906 }
1907
1908 /* Use the rvalue deref handler for the most part. We'll ignore
1909 * swizzles in it and write swizzles using writemask, though.
1910 */
1911 ir->accept(v);
1912 return dst_reg(v->result);
1913 }
1914
1915 void
1916 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1917 const struct glsl_type *type, uint32_t predicate)
1918 {
1919 if (type->base_type == GLSL_TYPE_STRUCT) {
1920 for (unsigned int i = 0; i < type->length; i++) {
1921 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1922 }
1923 return;
1924 }
1925
1926 if (type->is_array()) {
1927 for (unsigned int i = 0; i < type->length; i++) {
1928 emit_block_move(dst, src, type->fields.array, predicate);
1929 }
1930 return;
1931 }
1932
1933 if (type->is_matrix()) {
1934 const struct glsl_type *vec_type;
1935
1936 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1937 type->vector_elements, 1);
1938
1939 for (int i = 0; i < type->matrix_columns; i++) {
1940 emit_block_move(dst, src, vec_type, predicate);
1941 }
1942 return;
1943 }
1944
1945 assert(type->is_scalar() || type->is_vector());
1946
1947 dst->type = brw_type_for_base_type(type);
1948 src->type = dst->type;
1949
1950 dst->writemask = (1 << type->vector_elements) - 1;
1951
1952 src->swizzle = swizzle_for_size(type->vector_elements);
1953
1954 vec4_instruction *inst = emit(MOV(*dst, *src));
1955 inst->predicate = predicate;
1956
1957 dst->reg_offset++;
1958 src->reg_offset++;
1959 }
1960
1961
1962 /* If the RHS processing resulted in an instruction generating a
1963 * temporary value, and it would be easy to rewrite the instruction to
1964 * generate its result right into the LHS instead, do so. This ends
1965 * up reliably removing instructions where it can be tricky to do so
1966 * later without real use/def (UD) chain information.
1967 */
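/* Illustrative example (not from the original comments): for GLSL
* `v = a + b;` the RHS visit leaves `ADD tmp, a, b` as the last emitted
* instruction; this helper retargets that ADD to write `v` directly, and
* visit(ir_assignment) then skips emitting the `MOV v, tmp` copy entirely.
*/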
1968 bool
1969 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1970 dst_reg dst,
1971 src_reg src,
1972 vec4_instruction *pre_rhs_inst,
1973 vec4_instruction *last_rhs_inst)
1974 {
1975 /* This could be supported, but it would take more smarts. */
1976 if (ir->condition)
1977 return false;
1978
1979 if (pre_rhs_inst == last_rhs_inst)
1980 return false; /* No instructions generated to work with. */
1981
1982 /* Make sure the last instruction generated our source reg. */
1983 if (src.file != GRF ||
1984 src.file != last_rhs_inst->dst.file ||
1985 src.reg != last_rhs_inst->dst.reg ||
1986 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1987 src.reladdr ||
1988 src.abs ||
1989 src.negate ||
1990 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1991 return false;
1992
1993 /* Check that the last instruction fully initialized the channels
1994 * we want to use, in the order we want to use them. We could
1995 * potentially reswizzle the operands of many instructions so that
1996 * we could handle out of order channels, but don't yet.
1997 */
1998
1999 for (unsigned i = 0; i < 4; i++) {
2000 if (dst.writemask & (1 << i)) {
2001 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2002 return false;
2003
2004 if (BRW_GET_SWZ(src.swizzle, i) != i)
2005 return false;
2006 }
2007 }
2008
2009 /* Success! Rewrite the instruction. */
2010 last_rhs_inst->dst.file = dst.file;
2011 last_rhs_inst->dst.reg = dst.reg;
2012 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2013 last_rhs_inst->dst.reladdr = dst.reladdr;
2014 last_rhs_inst->dst.writemask &= dst.writemask;
2015
2016 return true;
2017 }
2018
2019 void
2020 vec4_visitor::visit(ir_assignment *ir)
2021 {
2022 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2023 uint32_t predicate = BRW_PREDICATE_NONE;
2024
2025 if (!ir->lhs->type->is_scalar() &&
2026 !ir->lhs->type->is_vector()) {
2027 ir->rhs->accept(this);
2028 src_reg src = this->result;
2029
2030 if (ir->condition) {
2031 emit_bool_to_cond_code(ir->condition, &predicate);
2032 }
2033
2034 /* emit_block_move doesn't account for swizzles in the source register.
2035 * This should be ok, since the source register is a structure or an
2036 * array, and those can't be swizzled. But double-check to be sure.
2037 */
2038 assert(src.swizzle ==
2039 (ir->rhs->type->is_matrix()
2040 ? swizzle_for_size(ir->rhs->type->vector_elements)
2041 : BRW_SWIZZLE_NOOP));
2042
2043 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2044 return;
2045 }
2046
2047 /* Now we're down to just a scalar/vector with writemasks. */
2048 int i;
2049
2050 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2051 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2052
2053 ir->rhs->accept(this);
2054
2055 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2056
2057 src_reg src = this->result;
2058
2059 int swizzles[4];
2060 int first_enabled_chan = 0;
2061 int src_chan = 0;
2062
2063 assert(ir->lhs->type->is_vector() ||
2064 ir->lhs->type->is_scalar());
2065 dst.writemask = ir->write_mask;
2066
2067 for (int i = 0; i < 4; i++) {
2068 if (dst.writemask & (1 << i)) {
2069 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2070 break;
2071 }
2072 }
2073
2074 /* Swizzle a small RHS vector into the channels being written.
2075 *
2076 * glsl ir treats write_mask as dictating how many channels are
2077 * present on the RHS, while in our instructions we need to make
2078 * those channels appear in the slots of the vec4 they're written to.
2079 */
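/* Worked example (illustrative): for `v.zw = someVec2;` the writemask
* covers z and w and the RHS swizzle starts as .xyyy (swizzle_for_size(2)).
* first_enabled_chan resolves to Y, so the loop below produces the swizzle
* .yyxy: the written z and w channels read the RHS's x and y, and the
* unwritten channels merely replicate an initialized component.
*/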
2080 for (int i = 0; i < 4; i++) {
2081 if (dst.writemask & (1 << i))
2082 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2083 else
2084 swizzles[i] = first_enabled_chan;
2085 }
2086 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2087 swizzles[2], swizzles[3]);
2088
2089 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2090 return;
2091 }
2092
2093 if (ir->condition) {
2094 emit_bool_to_cond_code(ir->condition, &predicate);
2095 }
2096
2097 for (i = 0; i < type_size(ir->lhs->type); i++) {
2098 vec4_instruction *inst = emit(MOV(dst, src));
2099 inst->predicate = predicate;
2100
2101 dst.reg_offset++;
2102 src.reg_offset++;
2103 }
2104 }
2105
2106 void
2107 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2108 {
2109 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2110 foreach_list(node, &ir->components) {
2111 ir_constant *field_value = (ir_constant *)node;
2112
2113 emit_constant_values(dst, field_value);
2114 }
2115 return;
2116 }
2117
2118 if (ir->type->is_array()) {
2119 for (unsigned int i = 0; i < ir->type->length; i++) {
2120 emit_constant_values(dst, ir->array_elements[i]);
2121 }
2122 return;
2123 }
2124
2125 if (ir->type->is_matrix()) {
2126 for (int i = 0; i < ir->type->matrix_columns; i++) {
2127 float *vec = &ir->value.f[i * ir->type->vector_elements];
2128
2129 for (int j = 0; j < ir->type->vector_elements; j++) {
2130 dst->writemask = 1 << j;
2131 dst->type = BRW_REGISTER_TYPE_F;
2132
2133 emit(MOV(*dst, src_reg(vec[j])));
2134 }
2135 dst->reg_offset++;
2136 }
2137 return;
2138 }
2139
2140 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2141
2142 for (int i = 0; i < ir->type->vector_elements; i++) {
2143 if (!(remaining_writemask & (1 << i)))
2144 continue;
2145
2146 dst->writemask = 1 << i;
2147 dst->type = brw_type_for_base_type(ir->type);
2148
2149 /* Find other components that match the one we're about to
2150 * write. Emits fewer instructions for things like vec4(0.5,
2151 * 1.5, 1.5, 1.5).
2152 */
2153 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2154 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2155 if (ir->value.b[i] == ir->value.b[j])
2156 dst->writemask |= (1 << j);
2157 } else {
2158 /* u, i, and f storage all line up, so no need for a
2159 * switch case for comparing each type.
2160 */
2161 if (ir->value.u[i] == ir->value.u[j])
2162 dst->writemask |= (1 << j);
2163 }
2164 }
2165
2166 switch (ir->type->base_type) {
2167 case GLSL_TYPE_FLOAT:
2168 emit(MOV(*dst, src_reg(ir->value.f[i])));
2169 break;
2170 case GLSL_TYPE_INT:
2171 emit(MOV(*dst, src_reg(ir->value.i[i])));
2172 break;
2173 case GLSL_TYPE_UINT:
2174 emit(MOV(*dst, src_reg(ir->value.u[i])));
2175 break;
2176 case GLSL_TYPE_BOOL:
2177 emit(MOV(*dst, src_reg(ir->value.b[i])));
2178 break;
2179 default:
2180 assert(!"Non-float/uint/int/bool constant");
2181 break;
2182 }
2183
2184 remaining_writemask &= ~dst->writemask;
2185 }
2186 dst->reg_offset++;
2187 }
2188
2189 void
2190 vec4_visitor::visit(ir_constant *ir)
2191 {
2192 dst_reg dst = dst_reg(this, ir->type);
2193 this->result = src_reg(dst);
2194
2195 emit_constant_values(&dst, ir);
2196 }
2197
2198 void
2199 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2200 {
2201 ir_dereference *deref = static_cast<ir_dereference *>(
2202 ir->actual_parameters.get_head());
2203 ir_variable *location = deref->variable_referenced();
2204 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2205 location->data.atomic.buffer_index);
2206
2207 /* Calculate the surface offset */
2208 src_reg offset(this, glsl_type::uint_type);
2209 ir_dereference_array *deref_array = deref->as_dereference_array();
2210 if (deref_array) {
2211 deref_array->array_index->accept(this);
2212
2213 src_reg tmp(this, glsl_type::uint_type);
2214 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2215 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2216 } else {
2217 offset = location->data.atomic.offset;
2218 }
2219
2220 /* Emit the appropriate machine instruction */
2221 const char *callee = ir->callee->function_name();
2222 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2223
2224 if (!strcmp("__intrinsic_atomic_read", callee)) {
2225 emit_untyped_surface_read(surf_index, dst, offset);
2226
2227 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2228 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2229 src_reg(), src_reg());
2230
2231 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2232 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2233 src_reg(), src_reg());
2234 }
2235 }
2236
2237 void
2238 vec4_visitor::visit(ir_call *ir)
2239 {
2240 const char *callee = ir->callee->function_name();
2241
2242 if (!strcmp("__intrinsic_atomic_read", callee) ||
2243 !strcmp("__intrinsic_atomic_increment", callee) ||
2244 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2245 visit_atomic_counter_intrinsic(ir);
2246 } else {
2247 assert(!"Unsupported intrinsic.");
2248 }
2249 }
2250
2251 src_reg
2252 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2253 {
2254 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2255 inst->base_mrf = 2;
2256 inst->mlen = 1;
2257 inst->sampler = sampler;
2258 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2259 inst->dst.writemask = WRITEMASK_XYZW;
2260
2261 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2262 int param_base = inst->base_mrf;
2263 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2264 int zero_mask = 0xf & ~coord_mask;
2265
2266 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2267 coordinate));
2268
2269 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2270 src_reg(0)));
2271
2272 emit(inst);
2273 return src_reg(inst->dst);
2274 }
2275
2276 void
2277 vec4_visitor::visit(ir_texture *ir)
2278 {
2279 int sampler =
2280 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2281
2282 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2283 * emitting anything other than setting up the constant result.
2284 */
2285 if (ir->op == ir_tg4) {
2286 ir_constant *chan = ir->lod_info.component->as_constant();
2287 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2288 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2289 dst_reg result(this, ir->type);
2290 this->result = src_reg(result);
2291 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2292 return;
2293 }
2294 }
2295
2296 /* Should be lowered by do_lower_texture_projection */
2297 assert(!ir->projector);
2298
2299 /* Should be lowered */
2300 assert(!ir->offset || !ir->offset->type->is_array());
2301
2302 /* Generate code to compute all the subexpression trees. This has to be
2303 * done before loading any values into MRFs for the sampler message since
2304 * generating these values may involve SEND messages that need the MRFs.
2305 */
2306 src_reg coordinate;
2307 if (ir->coordinate) {
2308 ir->coordinate->accept(this);
2309 coordinate = this->result;
2310 }
2311
2312 src_reg shadow_comparitor;
2313 if (ir->shadow_comparitor) {
2314 ir->shadow_comparitor->accept(this);
2315 shadow_comparitor = this->result;
2316 }
2317
2318 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2319 src_reg offset_value;
2320 if (has_nonconstant_offset) {
2321 ir->offset->accept(this);
2322 offset_value = src_reg(this->result);
2323 }
2324
2325 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2326 src_reg lod, dPdx, dPdy, sample_index, mcs;
2327 switch (ir->op) {
2328 case ir_tex:
2329 lod = src_reg(0.0f);
2330 lod_type = glsl_type::float_type;
2331 break;
2332 case ir_txf:
2333 case ir_txl:
2334 case ir_txs:
2335 ir->lod_info.lod->accept(this);
2336 lod = this->result;
2337 lod_type = ir->lod_info.lod->type;
2338 break;
2339 case ir_query_levels:
2340 lod = src_reg(0);
2341 lod_type = glsl_type::int_type;
2342 break;
2343 case ir_txf_ms:
2344 ir->lod_info.sample_index->accept(this);
2345 sample_index = this->result;
2346 sample_index_type = ir->lod_info.sample_index->type;
2347
2348 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2349 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2350 else
2351 mcs = src_reg(0u);
2352 break;
2353 case ir_txd:
2354 ir->lod_info.grad.dPdx->accept(this);
2355 dPdx = this->result;
2356
2357 ir->lod_info.grad.dPdy->accept(this);
2358 dPdy = this->result;
2359
2360 lod_type = ir->lod_info.grad.dPdx->type;
2361 break;
2362 case ir_txb:
2363 case ir_lod:
2364 case ir_tg4:
2365 break;
2366 }
2367
2368 vec4_instruction *inst = NULL;
2369 switch (ir->op) {
2370 case ir_tex:
2371 case ir_txl:
2372 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2373 break;
2374 case ir_txd:
2375 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2376 break;
2377 case ir_txf:
2378 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2379 break;
2380 case ir_txf_ms:
2381 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2382 break;
2383 case ir_txs:
2384 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2385 break;
2386 case ir_tg4:
2387 if (has_nonconstant_offset)
2388 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2389 else
2390 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2391 break;
2392 case ir_query_levels:
2393 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2394 break;
2395 case ir_txb:
2396 assert(!"TXB is not valid for vertex shaders.");
2397 break;
2398 case ir_lod:
2399 assert(!"LOD is not valid for vertex shaders.");
2400 break;
2401 default:
2402 assert(!"Unrecognized tex op");
2403 }
2404
2405 if (ir->offset != NULL && ir->op != ir_txf)
2406 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2407
2408 /* Stuff the channel select bits in the top of the texture offset */
2409 if (ir->op == ir_tg4)
2410 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2411
2412 /* The message header is necessary for:
2413 * - Gen4 (always)
2414 * - Texel offsets
2415 * - Gather channel selection
2416 * - Sampler indices too large to fit in a 4-bit value.
2417 */
2418 inst->header_present =
2419 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2420 sampler >= 16;
2421 inst->base_mrf = 2;
2422 inst->mlen = inst->header_present + 1; /* always at least one */
2423 inst->sampler = sampler;
2424 inst->dst = dst_reg(this, ir->type);
2425 inst->dst.writemask = WRITEMASK_XYZW;
2426 inst->shadow_compare = ir->shadow_comparitor != NULL;
2427
2428 /* MRF for the first parameter */
2429 int param_base = inst->base_mrf + inst->header_present;
2430
2431 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2432 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2433 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2434 } else {
2435 /* Load the coordinate */
2436 /* FINISHME: gl_clamp_mask and saturate */
2437 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2438 int zero_mask = 0xf & ~coord_mask;
2439
2440 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2441 coordinate));
2442
2443 if (zero_mask != 0) {
2444 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2445 src_reg(0)));
2446 }
2447 /* Load the shadow comparitor */
2448 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2449 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2450 WRITEMASK_X),
2451 shadow_comparitor));
2452 inst->mlen++;
2453 }
2454
2455 /* Load the LOD info */
2456 if (ir->op == ir_tex || ir->op == ir_txl) {
2457 int mrf, writemask;
2458 if (brw->gen >= 5) {
2459 mrf = param_base + 1;
2460 if (ir->shadow_comparitor) {
2461 writemask = WRITEMASK_Y;
2462 /* mlen already incremented */
2463 } else {
2464 writemask = WRITEMASK_X;
2465 inst->mlen++;
2466 }
2467 } else /* brw->gen == 4 */ {
2468 mrf = param_base;
2469 writemask = WRITEMASK_W;
2470 }
2471 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2472 } else if (ir->op == ir_txf) {
2473 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2474 } else if (ir->op == ir_txf_ms) {
2475 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2476 sample_index));
2477 if (brw->gen >= 7) {
2478 /* MCS data is in the first channel of `mcs`, but we need to get it into
2479 * the .y channel of the second vec4 of params, so replicate .x across
2480 * the whole vec4 and then mask off everything except .y
2481 */
2482 mcs.swizzle = BRW_SWIZZLE_XXXX;
2483 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2484 mcs));
}
2485 inst->mlen++;
2486 } else if (ir->op == ir_txd) {
2487 const glsl_type *type = lod_type;
2488
2489 if (brw->gen >= 5) {
2490 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2491 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2492 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2493 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2494 inst->mlen++;
2495
2496 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2497 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2498 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2499 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2500 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2501 inst->mlen++;
2502
2503 if (ir->shadow_comparitor) {
2504 emit(MOV(dst_reg(MRF, param_base + 2,
2505 ir->shadow_comparitor->type, WRITEMASK_Z),
2506 shadow_comparitor));
2507 }
2508 }
2509 } else /* brw->gen == 4 */ {
2510 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2511 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2512 inst->mlen += 2;
2513 }
2514 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2515 if (ir->shadow_comparitor) {
2516 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2517 shadow_comparitor));
2518 }
2519
2520 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2521 offset_value));
2522 inst->mlen++;
2523 }
2524 }
2525
2526 emit(inst);
2527
2528 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2529 * spec requires layers.
2530 */
2531 if (ir->op == ir_txs) {
2532 glsl_type const *type = ir->sampler->type;
2533 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2534 type->sampler_array) {
2535 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2536 writemask(inst->dst, WRITEMASK_Z),
2537 src_reg(inst->dst), src_reg(6));
2538 }
2539 }
2540
2541 if (brw->gen == 6 && ir->op == ir_tg4) {
2542 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2543 }
2544
2545 swizzle_result(ir, src_reg(inst->dst), sampler);
2546 }
2547
2548 /**
2549 * Apply workarounds for Gen6 gather with UINT/SINT
2550 */
2551 void
2552 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2553 {
2554 if (!wa)
2555 return;
2556
2557 int width = (wa & WA_8BIT) ? 8 : 16;
2558 dst_reg dst_f = dst;
2559 dst_f.type = BRW_REGISTER_TYPE_F;
2560
2561 /* Convert from UNORM to UINT */
2562 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2563 emit(MOV(dst, src_reg(dst_f)));
2564
2565 if (wa & WA_SIGN) {
2566 /* Reinterpret the UINT value as a signed INT value by
2567 * shifting the sign bit into place, then shifting back
2568 * preserving sign.
2569 */
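/* For instance (assuming an 8-bit signed format, so width == 8): a raw
* texel of 0xff is returned as UNORM 1.0, the MUL/MOV above turn it into
* the integer 255, and the SHL/ASR below by 32 - 8 = 24 bits sign-extend
* it to -1, the value a signed 8-bit fetch should have produced.
*/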
2570 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2571 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2572 }
2573 }
2574
2575 /**
2576 * Set up the gather channel based on the swizzle, for gather4.
2577 */
2578 uint32_t
2579 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2580 {
2581 ir_constant *chan = ir->lod_info.component->as_constant();
2582 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2583 switch (swiz) {
2584 case SWIZZLE_X: return 0;
2585 case SWIZZLE_Y:
2586 /* gather4 sampler is broken for green channel on RG32F --
2587 * we must ask for blue instead.
2588 */
2589 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2590 return 2;
2591 return 1;
2592 case SWIZZLE_Z: return 2;
2593 case SWIZZLE_W: return 3;
2594 default:
2595 assert(!"Not reached"); /* zero, one swizzles handled already */
2596 return 0;
2597 }
2598 }
2599
2600 void
2601 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2602 {
2603 int s = key->tex.swizzles[sampler];
2604
2605 this->result = src_reg(this, ir->type);
2606 dst_reg swizzled_result(this->result);
2607
2608 if (ir->op == ir_query_levels) {
2609 /* # levels is in .w */
2610 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2611 emit(MOV(swizzled_result, orig_val));
2612 return;
2613 }
2614
2615 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2616 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2617 emit(MOV(swizzled_result, orig_val));
2618 return;
2619 }
2620
2621
2622 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2623 int swizzle[4] = {0};
2624
2625 for (int i = 0; i < 4; i++) {
2626 switch (GET_SWZ(s, i)) {
2627 case SWIZZLE_ZERO:
2628 zero_mask |= (1 << i);
2629 break;
2630 case SWIZZLE_ONE:
2631 one_mask |= (1 << i);
2632 break;
2633 default:
2634 copy_mask |= (1 << i);
2635 swizzle[i] = GET_SWZ(s, i);
2636 break;
2637 }
2638 }
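/* Hypothetical example: with a texture swizzle of (G, R, ONE, ZERO) the
* loop above yields copy_mask = xy with orig_val read through swizzle
* .yxxx (the last two components are don't-cares), one_mask = z and
* zero_mask = w, so three MOVs are emitted below.
*/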
2639
2640 if (copy_mask) {
2641 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2642 swizzled_result.writemask = copy_mask;
2643 emit(MOV(swizzled_result, orig_val));
2644 }
2645
2646 if (zero_mask) {
2647 swizzled_result.writemask = zero_mask;
2648 emit(MOV(swizzled_result, src_reg(0.0f)));
2649 }
2650
2651 if (one_mask) {
2652 swizzled_result.writemask = one_mask;
2653 emit(MOV(swizzled_result, src_reg(1.0f)));
2654 }
2655 }
2656
2657 void
2658 vec4_visitor::visit(ir_return *ir)
2659 {
2660 assert(!"not reached");
2661 }
2662
2663 void
2664 vec4_visitor::visit(ir_discard *ir)
2665 {
2666 assert(!"not reached");
2667 }
2668
2669 void
2670 vec4_visitor::visit(ir_if *ir)
2671 {
2672 /* Don't point the annotation at the if statement, because then it plus
2673 * the then and else blocks get printed.
2674 */
2675 this->base_ir = ir->condition;
2676
2677 if (brw->gen == 6) {
2678 emit_if_gen6(ir);
2679 } else {
2680 uint32_t predicate;
2681 emit_bool_to_cond_code(ir->condition, &predicate);
2682 emit(IF(predicate));
2683 }
2684
2685 visit_instructions(&ir->then_instructions);
2686
2687 if (!ir->else_instructions.is_empty()) {
2688 this->base_ir = ir->condition;
2689 emit(BRW_OPCODE_ELSE);
2690
2691 visit_instructions(&ir->else_instructions);
2692 }
2693
2694 this->base_ir = ir->condition;
2695 emit(BRW_OPCODE_ENDIF);
2696 }
2697
2698 void
2699 vec4_visitor::visit(ir_emit_vertex *)
2700 {
2701 assert(!"not reached");
2702 }
2703
2704 void
2705 vec4_visitor::visit(ir_end_primitive *)
2706 {
2707 assert(!"not reached");
2708 }
2709
2710 void
2711 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2712 dst_reg dst, src_reg offset,
2713 src_reg src0, src_reg src1)
2714 {
2715 unsigned mlen = 0;
2716
2717 /* Set the atomic operation offset. */
2718 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2719 mlen++;
2720
2721 /* Set the atomic operation arguments. */
2722 if (src0.file != BAD_FILE) {
2723 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2724 mlen++;
2725 }
2726
2727 if (src1.file != BAD_FILE) {
2728 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2729 mlen++;
2730 }
2731
2732 /* Emit the instruction. Note that this maps to the normal SIMD8
2733 * untyped atomic message on Ivy Bridge, but that's OK because
2734 * unused channels will be masked out.
2735 */
2736 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2737 src_reg(atomic_op), src_reg(surf_index));
2738 inst->base_mrf = 0;
2739 inst->mlen = mlen;
2740 }
2741
2742 void
2743 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2744 src_reg offset)
2745 {
2746 /* Set the surface read offset. */
2747 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2748
2749 /* Emit the instruction. Note that this maps to the normal SIMD8
2750 * untyped surface read message, but that's OK because unused
2751 * channels will be masked out.
2752 */
2753 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2754 dst, src_reg(surf_index));
2755 inst->base_mrf = 0;
2756 inst->mlen = 1;
2757 }
2758
2759 void
2760 vec4_visitor::emit_ndc_computation()
2761 {
2762 /* Get the position */
2763 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2764
2765 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2766 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2767 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2768
2769 current_annotation = "NDC";
2770 dst_reg ndc_w = ndc;
2771 ndc_w.writemask = WRITEMASK_W;
2772 src_reg pos_w = pos;
2773 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2774 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2775
2776 dst_reg ndc_xyz = ndc;
2777 ndc_xyz.writemask = WRITEMASK_XYZ;
2778
2779 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2780 }
2781
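/* Fill the first VUE header register: on Gen4/5 this packs the point size
* and the clip/NDC workaround flags into one DWord, while on Gen6+ it
* simply writes point size, layer and viewport index into the .w/.y/.z
* channels when the corresponding varyings are present.
*/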
2782 void
2783 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2784 {
2785 if (brw->gen < 6 &&
2786 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2787 key->userclip_active || brw->has_negative_rhw_bug)) {
2788 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2789 dst_reg header1_w = header1;
2790 header1_w.writemask = WRITEMASK_W;
2791
2792 emit(MOV(header1, 0u));
2793
2794 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2795 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2796
2797 current_annotation = "Point size";
2798 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2799 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2800 }
2801
2802 if (key->userclip_active) {
2803 current_annotation = "Clipping flags";
2804 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2805 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2806
2807 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2808 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2809 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2810
2811 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2812 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2813 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2814 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2815 }
2816
2817 /* i965 clipping workaround:
2818 * 1) Test for -ve rhw
2819 * 2) If set,
2820 * set ndc = (0,0,0,0)
2821 * set ucp[6] = 1
2822 *
2823 * Later, clipping will detect ucp[6] and ensure the primitive is
2824 * clipped against all fixed planes.
2825 */
2826 if (brw->has_negative_rhw_bug) {
2827 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2828 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2829 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2830 vec4_instruction *inst;
2831 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2832 inst->predicate = BRW_PREDICATE_NORMAL;
2833 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2834 inst->predicate = BRW_PREDICATE_NORMAL;
2835 }
2836
2837 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2838 } else if (brw->gen < 6) {
2839 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2840 } else {
2841 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2842 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2843 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2844 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2845 }
2846 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2847 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2848 src_reg(output_reg[VARYING_SLOT_LAYER])));
2849 }
2850 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2851 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2852 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2853 }
2854 }
2855 }
2856
2857 void
2858 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2859 {
2860 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2861 *
2862 * "If a linked set of shaders forming the vertex stage contains no
2863 * static write to gl_ClipVertex or gl_ClipDistance, but the
2864 * application has requested clipping against user clip planes through
2865 * the API, then the coordinate written to gl_Position is used for
2866 * comparison against the user clip planes."
2867 *
2868 * This function is only called if the shader didn't write to
2869 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2870 * if the user wrote to it; otherwise we use gl_Position.
2871 */
2872 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2873 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2874 clip_vertex = VARYING_SLOT_POS;
2875 }
2876
2877 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2878 ++i) {
2879 reg.writemask = 1 << i;
2880 emit(DP4(reg,
2881 src_reg(output_reg[clip_vertex]),
2882 src_reg(this->userplane[i + offset])));
2883 }
2884 }
2885
2886 void
2887 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2888 {
2889 assert (varying < VARYING_SLOT_MAX);
2890 reg.type = output_reg[varying].type;
2891 current_annotation = output_reg_annotation[varying];
2892 /* Copy the register, saturating if necessary */
2893 vec4_instruction *inst = emit(MOV(reg,
2894 src_reg(output_reg[varying])));
2895 if ((varying == VARYING_SLOT_COL0 ||
2896 varying == VARYING_SLOT_COL1 ||
2897 varying == VARYING_SLOT_BFC0 ||
2898 varying == VARYING_SLOT_BFC1) &&
2899 key->clamp_vertex_color) {
2900 inst->saturate = true;
2901 }
2902 }
2903
2904 void
2905 vec4_visitor::emit_urb_slot(int mrf, int varying)
2906 {
2907 struct brw_reg hw_reg = brw_message_reg(mrf);
2908 dst_reg reg = dst_reg(MRF, mrf);
2909 reg.type = BRW_REGISTER_TYPE_F;
2910
2911 switch (varying) {
2912 case VARYING_SLOT_PSIZ:
2913 /* PSIZ is always in slot 0, and is coupled with other flags. */
2914 current_annotation = "indices, point width, clip flags";
2915 emit_psiz_and_flags(hw_reg);
2916 break;
2917 case BRW_VARYING_SLOT_NDC:
2918 current_annotation = "NDC";
2919 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2920 break;
2921 case VARYING_SLOT_POS:
2922 current_annotation = "gl_Position";
2923 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2924 break;
2925 case VARYING_SLOT_EDGE:
2926 /* This is present when doing unfilled polygons. We're supposed to copy
2927 * the edge flag from the user-provided vertex array
2928 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2929 * of that attribute (starts as 1.0f). This is then used in clipping to
2930 * determine which edges should be drawn as wireframe.
2931 */
2932 current_annotation = "edge flag";
2933 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2934 glsl_type::float_type, WRITEMASK_XYZW))));
2935 break;
2936 case BRW_VARYING_SLOT_PAD:
2937 /* No need to write to this slot */
2938 break;
2939 default:
2940 emit_generic_urb_slot(reg, varying);
2941 break;
2942 }
2943 }
2944
2945 static int
2946 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2947 {
2948 if (brw->gen >= 6) {
2949 /* URB data written (does not include the message header reg) must
2950 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2951 * section 5.4.3.2.2: URB_INTERLEAVED.
2952 *
2953 * URB entries are allocated on a multiple of 1024 bits, so an
2954 * extra 128 bits written here to make the end align to 256 is
2955 * no problem.
2956 */
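/* Example: a message of one header register plus three slot registers has
* mlen == 4, i.e. an odd three registers of URB data; the check below bumps
* it to 5 so that an even four data registers are written.
*/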
2957 if ((mlen % 2) != 1)
2958 mlen++;
2959 }
2960
2961 return mlen;
2962 }
2963
2964
2965 /**
2966 * Generates the VUE payload plus the necessary URB write instructions to
2967 * output it.
2968 *
2969 * The VUE layout is documented in Volume 2a.
2970 */
2971 void
2972 vec4_visitor::emit_vertex()
2973 {
2974 /* MRF 0 is reserved for the debugger, so start with message header
2975 * in MRF 1.
2976 */
2977 int base_mrf = 1;
2978 int mrf = base_mrf;
2979 /* In the process of generating our URB write message contents, we
2980 * may need to unspill a register or load from an array. Those
2981 * reads would use MRFs 14-15.
2982 */
2983 int max_usable_mrf = 13;
2984
2985 /* The following assertion verifies that max_usable_mrf causes an
2986 * even-numbered amount of URB write data, which will meet gen6's
2987 * requirements for length alignment.
2988 */
2989 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2990
2991 /* First mrf is the g0-based message header containing URB handles and
2992 * such.
2993 */
2994 emit_urb_write_header(mrf++);
2995
2996 if (brw->gen < 6) {
2997 emit_ndc_computation();
2998 }
2999
3000 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3001 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3002 current_annotation = "user clip distances";
3003
3004 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3005 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3006
3007 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3008 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3009 }
3010
3011 /* We may need to split this up into several URB writes, so do them in a
3012 * loop.
3013 */
3014 int slot = 0;
3015 bool complete = false;
3016 do {
3017 /* URB offset is in URB row increments, and each of our MRFs is half of
3018 * one of those, since we're doing interleaved writes.
3019 */
3020 int offset = slot / 2;
3021
3022 mrf = base_mrf + 1;
3023 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3024 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3025
3026 /* If this was max_usable_mrf, we can't fit anything more into this
3027 * URB WRITE.
3028 */
3029 if (mrf > max_usable_mrf) {
3030 slot++;
3031 break;
3032 }
3033 }
3034
3035 complete = slot >= prog_data->vue_map.num_slots;
3036 current_annotation = "URB write";
3037 vec4_instruction *inst = emit_urb_write_opcode(complete);
3038 inst->base_mrf = base_mrf;
3039 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3040 inst->offset += offset;
3041 } while(!complete);
3042 }
3043
3044
3045 src_reg
3046 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3047 src_reg *reladdr, int reg_offset)
3048 {
3049 /* Because we store the values to scratch interleaved like our
3050 * vertex data, we need to scale the vec4 index by 2.
3051 */
3052 int message_header_scale = 2;
3053
3054 /* Pre-gen6, the message header uses byte offsets instead of vec4
3055 * (16-byte) offset units.
3056 */
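/* For example (illustrative numbers): a value at scratch slot 3 ends up at
* interleaved offset 3 * 2 = 6 on Gen6+, and at byte offset 3 * 2 * 16 = 96
* on Gen4/5, matching the units each generation's message header expects.
*/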
3057 if (brw->gen < 6)
3058 message_header_scale *= 16;
3059
3060 if (reladdr) {
3061 src_reg index = src_reg(this, glsl_type::int_type);
3062
3063 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3064 emit_before(inst, MUL(dst_reg(index),
3065 index, src_reg(message_header_scale)));
3066
3067 return index;
3068 } else {
3069 return src_reg(reg_offset * message_header_scale);
3070 }
3071 }
3072
3073 src_reg
3074 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3075 src_reg *reladdr, int reg_offset)
3076 {
3077 if (reladdr) {
3078 src_reg index = src_reg(this, glsl_type::int_type);
3079
3080 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3081
3082 /* Pre-gen6, the message header uses byte offsets instead of vec4
3083 * (16-byte) offset units.
3084 */
3085 if (brw->gen < 6) {
3086 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3087 }
3088
3089 return index;
3090 } else if (brw->gen >= 8) {
3091 /* Store the offset in a GRF so we can send-from-GRF. */
3092 src_reg offset = src_reg(this, glsl_type::int_type);
3093 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3094 return offset;
3095 } else {
3096 int message_header_scale = brw->gen < 6 ? 16 : 1;
3097 return src_reg(reg_offset * message_header_scale);
3098 }
3099 }
3100
3101 /**
3102 * Emits an instruction before @inst to load the value named by @orig_src
3103 * from scratch space at @base_offset to @temp.
3104 *
3105 * @base_offset is measured in 32-byte units (the size of a register).
3106 */
3107 void
3108 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3109 dst_reg temp, src_reg orig_src,
3110 int base_offset)
3111 {
3112 int reg_offset = base_offset + orig_src.reg_offset;
3113 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3114
3115 emit_before(inst, SCRATCH_READ(temp, index));
3116 }
3117
3118 /**
3119 * Emits an instruction after @inst to store the value to be written
3120 * to @orig_dst to scratch space at @base_offset, from @temp.
3121 *
3122 * @base_offset is measured in 32-byte units (the size of a register).
3123 */
3124 void
3125 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3126 {
3127 int reg_offset = base_offset + inst->dst.reg_offset;
3128 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3129
3130 /* Create a temporary register to store *inst's result in.
3131 *
3132 * We have to be careful in MOVing from our temporary result register in
3133 * the scratch write. If we swizzle from channels of the temporary that
3134 * weren't initialized, it will confuse live interval analysis, which will
3135 * make spilling fail to make progress.
3136 */
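/* E.g. if the spilled instruction only writes .x and .w, the code below
* builds the swizzle .xxxw so the scratch-write MOV never reads the
* uninitialized .y/.z channels of the temporary.
*/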
3137 src_reg temp = src_reg(this, glsl_type::vec4_type);
3138 temp.type = inst->dst.type;
3139 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3140 int swizzles[4];
3141 for (int i = 0; i < 4; i++)
3142 if (inst->dst.writemask & (1 << i))
3143 swizzles[i] = i;
3144 else
3145 swizzles[i] = first_writemask_chan;
3146 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3147 swizzles[2], swizzles[3]);
3148
3149 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3150 inst->dst.writemask));
3151 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3152 write->predicate = inst->predicate;
3153 write->ir = inst->ir;
3154 write->annotation = inst->annotation;
3155 inst->insert_after(write);
3156
3157 inst->dst.file = temp.file;
3158 inst->dst.reg = temp.reg;
3159 inst->dst.reg_offset = temp.reg_offset;
3160 inst->dst.reladdr = NULL;
3161 }
3162
3163 /**
3164 * We can't generally support array access in GRF space, because a
3165 * single instruction's destination can only span 2 contiguous
3166 * registers. So, we send all GRF arrays that get variable index
3167 * access to scratch space.
3168 */
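/* For example, a temporary like `vec4 arr[8]` indexed with a loop counter
* produces GRF accesses carrying a reladdr; each such def or use is
* rewritten below into a scratch write or scratch read at the array's
* assigned scratch location. (The GLSL snippet is only illustrative.)
*/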
3169 void
3170 vec4_visitor::move_grf_array_access_to_scratch()
3171 {
3172 int scratch_loc[this->virtual_grf_count];
3173
3174 for (int i = 0; i < this->virtual_grf_count; i++) {
3175 scratch_loc[i] = -1;
3176 }
3177
3178 /* First, calculate the set of virtual GRFs that need to be punted
3179 * to scratch due to having any array access on them, and where in
3180 * scratch.
3181 */
3182 foreach_list(node, &this->instructions) {
3183 vec4_instruction *inst = (vec4_instruction *)node;
3184
3185 if (inst->dst.file == GRF && inst->dst.reladdr &&
3186 scratch_loc[inst->dst.reg] == -1) {
3187 scratch_loc[inst->dst.reg] = c->last_scratch;
3188 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3189 }
3190
3191 for (int i = 0 ; i < 3; i++) {
3192 src_reg *src = &inst->src[i];
3193
3194 if (src->file == GRF && src->reladdr &&
3195 scratch_loc[src->reg] == -1) {
3196 scratch_loc[src->reg] = c->last_scratch;
3197 c->last_scratch += this->virtual_grf_sizes[src->reg];
3198 }
3199 }
3200 }
3201
3202 /* Now, for anything that will be accessed through scratch, rewrite
3203 * it to load/store. Note that this is a _safe list walk, because
3204 * we may generate a new scratch_write instruction after the one
3205 * we're processing.
3206 */
3207 foreach_list_safe(node, &this->instructions) {
3208 vec4_instruction *inst = (vec4_instruction *)node;
3209
3210 /* Set up the annotation tracking for new generated instructions. */
3211 base_ir = inst->ir;
3212 current_annotation = inst->annotation;
3213
3214 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3215 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3216 }
3217
3218 for (int i = 0 ; i < 3; i++) {
3219 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3220 continue;
3221
3222 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3223
3224 emit_scratch_read(inst, temp, inst->src[i],
3225 scratch_loc[inst->src[i].reg]);
3226
3227 inst->src[i].file = temp.file;
3228 inst->src[i].reg = temp.reg;
3229 inst->src[i].reg_offset = temp.reg_offset;
3230 inst->src[i].reladdr = NULL;
3231 }
3232 }
3233 }
3234
3235 /**
3236 * Emits an instruction before @inst to load the value named by @orig_src
3237 * from the pull constant buffer (surface) at @base_offset to @temp.
3238 */
3239 void
3240 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3241 dst_reg temp, src_reg orig_src,
3242 int base_offset)
3243 {
3244 int reg_offset = base_offset + orig_src.reg_offset;
3245 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3246 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3247 vec4_instruction *load;
3248
3249 if (brw->gen >= 7) {
3250 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3251 grf_offset.type = offset.type;
3252 emit_before(inst, MOV(grf_offset, offset));
3253
3254 load = new(mem_ctx) vec4_instruction(this,
3255 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3256 temp, index, src_reg(grf_offset));
3257 } else {
3258 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3259 temp, index, offset);
3260 load->base_mrf = 14;
3261 load->mlen = 1;
3262 }
3263 emit_before(inst, load);
3264 }
3265
3266 /**
3267 * Implements array access of uniforms by inserting a
3268 * PULL_CONSTANT_LOAD instruction.
3269 *
3270 * Unlike temporary GRF array access (where we don't support it due to
3271 * the difficulty of doing relative addressing on instruction
3272 * destinations), we could potentially do array access of uniforms
3273 * that were loaded in GRF space as push constants. In real-world
3274 * usage we've seen, though, the arrays being used are always larger
3275 * than we could load as push constants, so just always move all
3276 * uniform array access out to a pull constant buffer.
3277 */
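/* For example, indexing something like `uniform vec4 kernel[64]` with a
* varying subscript gets that uniform's storage copied into pull_param and
* the access replaced by a PULL_CONSTANT_LOAD into a temporary, while
* constant subscripts stay in push constants. (The GLSL snippet is only
* illustrative.)
*/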
3278 void
3279 vec4_visitor::move_uniform_array_access_to_pull_constants()
3280 {
3281 int pull_constant_loc[this->uniforms];
3282
3283 for (int i = 0; i < this->uniforms; i++) {
3284 pull_constant_loc[i] = -1;
3285 }
3286
3287 /* Walk through and find array access of uniforms. Put a copy of that
3288 * uniform in the pull constant buffer.
3289 *
3290 * Note that we don't move constant-indexed accesses to arrays. No
3291 * testing has been done of the performance impact of this choice.
3292 */
3293 foreach_list_safe(node, &this->instructions) {
3294 vec4_instruction *inst = (vec4_instruction *)node;
3295
3296 for (int i = 0 ; i < 3; i++) {
3297 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3298 continue;
3299
3300 int uniform = inst->src[i].reg;
3301
3302 /* If this array isn't already present in the pull constant buffer,
3303 * add it.
3304 */
3305 if (pull_constant_loc[uniform] == -1) {
3306 const float **values = &stage_prog_data->param[uniform * 4];
3307
3308 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3309
3310 assert(uniform < uniform_array_size);
3311 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3312 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3313 = values[j];
3314 }
3315 }
3316
3317 /* Set up the annotation tracking for new generated instructions. */
3318 base_ir = inst->ir;
3319 current_annotation = inst->annotation;
3320
3321 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3322
3323 emit_pull_constant_load(inst, temp, inst->src[i],
3324 pull_constant_loc[uniform]);
3325
3326 inst->src[i].file = temp.file;
3327 inst->src[i].reg = temp.reg;
3328 inst->src[i].reg_offset = temp.reg_offset;
3329 inst->src[i].reladdr = NULL;
3330 }
3331 }
3332
3333 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3334 * no need to track them as larger-than-vec4 objects. This will be
3335 * relied on in cutting out unused uniform vectors from push
3336 * constants.
3337 */
3338 split_uniform_registers();
3339 }
3340
3341 void
3342 vec4_visitor::resolve_ud_negate(src_reg *reg)
3343 {
3344 if (reg->type != BRW_REGISTER_TYPE_UD ||
3345 !reg->negate)
3346 return;
3347
3348 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3349 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3350 *reg = temp;
3351 }
3352
3353 vec4_visitor::vec4_visitor(struct brw_context *brw,
3354 struct brw_vec4_compile *c,
3355 struct gl_program *prog,
3356 const struct brw_vec4_prog_key *key,
3357 struct brw_vec4_prog_data *prog_data,
3358 struct gl_shader_program *shader_prog,
3359 gl_shader_stage stage,
3360 void *mem_ctx,
3361 bool debug_flag,
3362 bool no_spills,
3363 shader_time_shader_type st_base,
3364 shader_time_shader_type st_written,
3365 shader_time_shader_type st_reset)
3366 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3367 c(c),
3368 key(key),
3369 prog_data(prog_data),
3370 sanity_param_count(0),
3371 fail_msg(NULL),
3372 first_non_payload_grf(0),
3373 need_all_constants_in_pull_buffer(false),
3374 debug_flag(debug_flag),
3375 no_spills(no_spills),
3376 st_base(st_base),
3377 st_written(st_written),
3378 st_reset(st_reset)
3379 {
3380 this->mem_ctx = mem_ctx;
3381 this->failed = false;
3382
3383 this->base_ir = NULL;
3384 this->current_annotation = NULL;
3385 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3386
3387 this->variable_ht = hash_table_ctor(0,
3388 hash_table_pointer_hash,
3389 hash_table_pointer_compare);
3390
3391 this->virtual_grf_start = NULL;
3392 this->virtual_grf_end = NULL;
3393 this->virtual_grf_sizes = NULL;
3394 this->virtual_grf_count = 0;
3395 this->virtual_grf_reg_map = NULL;
3396 this->virtual_grf_reg_count = 0;
3397 this->virtual_grf_array_size = 0;
3398 this->live_intervals_valid = false;
3399
3400 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3401
3402 this->uniforms = 0;
3403
3404 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3405 * at least one. See setup_uniforms() in brw_vec4.cpp.
3406 */
3407 this->uniform_array_size = 1;
3408 if (prog_data) {
3409 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3410 }
3411
3412 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3413 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3414 }
3415
3416 vec4_visitor::~vec4_visitor()
3417 {
3418 hash_table_dtor(this->variable_ht);
3419 }
3420
3421
3422 void
3423 vec4_visitor::fail(const char *format, ...)
3424 {
3425 va_list va;
3426 char *msg;
3427
3428 if (failed)
3429 return;
3430
3431 failed = true;
3432
3433 va_start(va, format);
3434 msg = ralloc_vasprintf(mem_ctx, format, va);
3435 va_end(va);
3436 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3437
3438 this->fail_msg = msg;
3439
3440 if (debug_flag) {
3441 fprintf(stderr, "%s", msg);
3442 }
3443 }
3444
3445 } /* namespace brw */