i965/vec4: Change vec4_visitor::emit_lrp to use MAC for gen<6
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->writes_accumulator = false;
46 this->conditional_mod = BRW_CONDITIONAL_NONE;
47 this->sampler = 0;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(dst_reg dst, src_reg src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
123 { \
124 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
125 src0, src1); \
126 }
127
128 #define ALU2_ACC(op) \
129 vec4_instruction * \
130 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
131 { \
132 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
133 BRW_OPCODE_##op, dst, src0, src1); \
134 inst->writes_accumulator = true; \
135 return inst; \
136 }
137
138 #define ALU3(op) \
139 vec4_instruction * \
140 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
141 { \
142 assert(brw->gen >= 6); \
143 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
144 src0, src1, src2); \
145 }
146
147 ALU1(NOT)
148 ALU1(MOV)
149 ALU1(FRC)
150 ALU1(RNDD)
151 ALU1(RNDE)
152 ALU1(RNDZ)
153 ALU1(F32TO16)
154 ALU1(F16TO32)
155 ALU2(ADD)
156 ALU2(MUL)
157 ALU2_ACC(MACH)
158 ALU2(AND)
159 ALU2(OR)
160 ALU2(XOR)
161 ALU2(DP3)
162 ALU2(DP4)
163 ALU2(DPH)
164 ALU2(SHL)
165 ALU2(SHR)
166 ALU2(ASR)
167 ALU3(LRP)
168 ALU1(BFREV)
169 ALU3(BFE)
170 ALU2(BFI1)
171 ALU3(BFI2)
172 ALU1(FBH)
173 ALU1(FBL)
174 ALU1(CBIT)
175 ALU3(MAD)
176 ALU2_ACC(ADDC)
177 ALU2_ACC(SUBB)
178 ALU2(MAC)
179
180 /** Gen4 predicated IF. */
181 vec4_instruction *
182 vec4_visitor::IF(uint32_t predicate)
183 {
184 vec4_instruction *inst;
185
186 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
187 inst->predicate = predicate;
188
189 return inst;
190 }
191
192 /** Gen6 IF with embedded comparison. */
193 vec4_instruction *
194 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
195 {
196 assert(brw->gen == 6);
197
198 vec4_instruction *inst;
199
200 resolve_ud_negate(&src0);
201 resolve_ud_negate(&src1);
202
203 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
204 src0, src1);
205 inst->conditional_mod = condition;
206
207 return inst;
208 }
209
210 /**
211 * CMP: Sets the low bit of the destination channels with the result
212 * of the comparison, while the upper bits are undefined, and updates
213 * the flag register with the packed 16 bits of the result.
214 */
215 vec4_instruction *
216 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
217 {
218 vec4_instruction *inst;
219
220 /* original gen4 does type conversion to the destination type
221 * before comparison, producing garbage results for floating
222 * point comparisons.
223 */
224 if (brw->gen == 4) {
225 dst.type = src0.type;
226 if (dst.file == HW_REG)
227 dst.fixed_hw_reg.type = dst.type;
228 }
229
230 resolve_ud_negate(&src0);
231 resolve_ud_negate(&src1);
232
233 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
234 inst->conditional_mod = condition;
235
236 return inst;
237 }
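
/* Typical use (sketch): CMP pairs with a predicated instruction that
 * consumes the flag result, e.g. the pre-gen6 min/max lowering in
 * emit_minmax() further down:
 *
 *    emit(CMP(dst, src0, src1, conditionalmod));
 *    inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
 *    inst->predicate = BRW_PREDICATE_NORMAL;
 */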
238
239 vec4_instruction *
240 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
241 {
242 vec4_instruction *inst;
243
244 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
245 dst, index);
246 inst->base_mrf = 14;
247 inst->mlen = 2;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
258 dst, src, index);
259 inst->base_mrf = 13;
260 inst->mlen = 3;
261
262 return inst;
263 }
264
265 void
266 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
267 {
268 static enum opcode dot_opcodes[] = {
269 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
270 };
271
272 emit(dot_opcodes[elements - 2], dst, src0, src1);
273 }
274
275 src_reg
276 vec4_visitor::fix_3src_operand(src_reg src)
277 {
278 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
279 * able to use vertical stride of zero to replicate the vec4 uniform, like
280 *
281 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
282 *
283 * But you can't, since vertical stride is always four in three-source
284 * instructions. Instead, insert a MOV instruction to do the replication so
285 * that the three-source instruction can consume it.
286 */
287
288 /* The MOV is only needed if the source is a uniform or immediate. */
289 if (src.file != UNIFORM && src.file != IMM)
290 return src;
291
292 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
293 return src;
294
295 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
296 expanded.type = src.type;
297 emit(MOV(expanded, src));
298 return src_reg(expanded);
299 }
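
/* Usage sketch: every operand of a three-source instruction is passed
 * through this helper first, as try_emit_mad() and the ir_triop_* cases
 * below do, e.g.
 *
 *    src_reg s0 = fix_3src_operand(op[0]);
 *    src_reg s1 = fix_3src_operand(op[1]);
 *    src_reg s2 = fix_3src_operand(op[2]);
 *    emit(BFI2(result_dst, s0, s1, s2));
 *
 * so that uniform and immediate operands are first copied into a GRF the
 * three-source instruction can actually read.
 */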
300
301 src_reg
302 vec4_visitor::fix_math_operand(src_reg src)
303 {
304 /* The gen6 math instruction ignores the source modifiers --
305 * swizzle, abs, negate, and at least some parts of the register
306 * region description.
307 *
308 * Rather than trying to enumerate all these cases, *always* expand the
309 * operand to a temp GRF for gen6.
310 *
311 * For gen7, keep the operand as-is, except if immediate, which gen7 still
312 * can't use.
313 */
314
315 if (brw->gen == 7 && src.file != IMM)
316 return src;
317
318 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
319 expanded.type = src.type;
320 emit(MOV(expanded, src));
321 return src_reg(expanded);
322 }
323
324 void
325 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
326 {
327 src = fix_math_operand(src);
328
329 if (dst.writemask != WRITEMASK_XYZW) {
330 /* The gen6 math instruction must be align1, so we can't do
331 * writemasks.
332 */
333 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
334
335 emit(opcode, temp_dst, src);
336
337 emit(MOV(dst, src_reg(temp_dst)));
338 } else {
339 emit(opcode, dst, src);
340 }
341 }
342
343 void
344 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
345 {
346 vec4_instruction *inst = emit(opcode, dst, src);
347 inst->base_mrf = 1;
348 inst->mlen = 1;
349 }
350
351 void
352 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
353 {
354 switch (opcode) {
355 case SHADER_OPCODE_RCP:
356 case SHADER_OPCODE_RSQ:
357 case SHADER_OPCODE_SQRT:
358 case SHADER_OPCODE_EXP2:
359 case SHADER_OPCODE_LOG2:
360 case SHADER_OPCODE_SIN:
361 case SHADER_OPCODE_COS:
362 break;
363 default:
364 assert(!"not reached: bad math opcode");
365 return;
366 }
367
368 if (brw->gen >= 6) {
369 return emit_math1_gen6(opcode, dst, src);
370 } else {
371 return emit_math1_gen4(opcode, dst, src);
372 }
373 }
374
375 void
376 vec4_visitor::emit_math2_gen6(enum opcode opcode,
377 dst_reg dst, src_reg src0, src_reg src1)
378 {
379 src0 = fix_math_operand(src0);
380 src1 = fix_math_operand(src1);
381
382 if (dst.writemask != WRITEMASK_XYZW) {
383 /* The gen6 math instruction must be align1, so we can't do
384 * writemasks.
385 */
386 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
387 temp_dst.type = dst.type;
388
389 emit(opcode, temp_dst, src0, src1);
390
391 emit(MOV(dst, src_reg(temp_dst)));
392 } else {
393 emit(opcode, dst, src0, src1);
394 }
395 }
396
397 void
398 vec4_visitor::emit_math2_gen4(enum opcode opcode,
399 dst_reg dst, src_reg src0, src_reg src1)
400 {
401 vec4_instruction *inst = emit(opcode, dst, src0, src1);
402 inst->base_mrf = 1;
403 inst->mlen = 2;
404 }
405
406 void
407 vec4_visitor::emit_math(enum opcode opcode,
408 dst_reg dst, src_reg src0, src_reg src1)
409 {
410 switch (opcode) {
411 case SHADER_OPCODE_POW:
412 case SHADER_OPCODE_INT_QUOTIENT:
413 case SHADER_OPCODE_INT_REMAINDER:
414 break;
415 default:
416 assert(!"not reached: unsupported binary math opcode");
417 return;
418 }
419
420 if (brw->gen >= 6) {
421 return emit_math2_gen6(opcode, dst, src0, src1);
422 } else {
423 return emit_math2_gen4(opcode, dst, src0, src1);
424 }
425 }
426
427 void
428 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
429 {
430 if (brw->gen < 7)
431 assert(!"ir_unop_pack_half_2x16 should be lowered");
432
433 assert(dst.type == BRW_REGISTER_TYPE_UD);
434 assert(src0.type == BRW_REGISTER_TYPE_F);
435
436 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
437 *
438 * Because this instruction does not have a 16-bit floating-point type,
439 * the destination data type must be Word (W).
440 *
441 * The destination must be DWord-aligned and specify a horizontal stride
442 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
443 * each destination channel and the upper word is not modified.
444 *
445 * The above restriction implies that the f32to16 instruction must use
446 * align1 mode, because only in align1 mode is it possible to specify
447 * horizontal stride. We choose here to defy the hardware docs and emit
448 * align16 instructions.
449 *
450 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
451 * instructions. I was partially successful in that the code passed all
452 * tests. However, the code was dubiously correct and fragile, and the
453 * tests were not harsh enough to probe that frailty. Not trusting the
454 * code, I chose instead to remain in align16 mode in defiance of the hw
455 * docs).
456 *
457 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
458 * simulator, emitting a f32to16 in align16 mode with UD as destination
459 * data type is safe. The behavior differs from that specified in the PRM
460 * in that the upper word of each destination channel is cleared to 0.
461 */
462
463 dst_reg tmp_dst(this, glsl_type::uvec2_type);
464 src_reg tmp_src(tmp_dst);
465
466 #if 0
467 /* Verify the undocumented behavior on which the following instructions
468 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
469 * then the result of the bit-or instruction below will be incorrect.
470 *
471 * You should inspect the disasm output in order to verify that the MOV is
472 * not optimized away.
473 */
474 emit(MOV(tmp_dst, src_reg(0x12345678u)));
475 #endif
476
477 /* Give tmp the form below, where "." means untouched.
478 *
479 * w z y x w z y x
480 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
481 *
482 * That the upper word of each write-channel be 0 is required for the
483 * following bit-shift and bit-or instructions to work. Note that this
484 * relies on the undocumented hardware behavior mentioned above.
485 */
486 tmp_dst.writemask = WRITEMASK_XY;
487 emit(F32TO16(tmp_dst, src0));
488
489 /* Give the write-channels of dst the form:
490 * 0xhhhh0000
491 */
492 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
493 emit(SHL(dst, tmp_src, src_reg(16u)));
494
495 /* Finally, give the write-channels of dst the form of packHalf2x16's
496 * output:
497 * 0xhhhhllll
498 */
499 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
500 emit(OR(dst, src_reg(dst), tmp_src));
501 }
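
/* Worked example (sketch) for the sequence above: packing vec2(1.0, -2.0),
 * f32to16 writes 0x00003c00 to tmp.x and 0x0000c000 to tmp.y (upper words
 * cleared per the undocumented behavior noted above).  SHL of tmp.yyyy by
 * 16 gives 0xc0000000, and the final OR with tmp.xxxx yields 0xc0003c00,
 * i.e. the Y half in the high word and the X half in the low word.
 */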
502
503 void
504 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
505 {
506 if (brw->gen < 7)
507 assert(!"ir_unop_unpack_half_2x16 should be lowered");
508
509 assert(dst.type == BRW_REGISTER_TYPE_F);
510 assert(src0.type == BRW_REGISTER_TYPE_UD);
511
512 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
513 *
514 * Because this instruction does not have a 16-bit floating-point type,
515 * the source data type must be Word (W). The destination type must be
516 * F (Float).
517 *
518 * To use W as the source data type, we must adjust horizontal strides,
519 * which is only possible in align1 mode. All my [chadv] attempts at
520 * emitting align1 instructions for unpackHalf2x16 failed to pass the
521 * Piglit tests, so I gave up.
522 *
523 * I've verified that, on gen7 hardware and the simulator, it is safe to
524 * emit f16to32 in align16 mode with UD as source data type.
525 */
526
527 dst_reg tmp_dst(this, glsl_type::uvec2_type);
528 src_reg tmp_src(tmp_dst);
529
530 tmp_dst.writemask = WRITEMASK_X;
531 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
532
533 tmp_dst.writemask = WRITEMASK_Y;
534 emit(SHR(tmp_dst, src0, src_reg(16u)));
535
536 dst.writemask = WRITEMASK_XY;
537 emit(F16TO32(dst, tmp_src));
538 }
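
/* Worked example (sketch) for the sequence above: unpacking 0xc0003c00,
 * the AND leaves 0x00003c00 in tmp.x, the SHR leaves 0x0000c000 in tmp.y,
 * and f16to32 then produces (1.0, -2.0) in dst.xy -- the inverse of the
 * packHalf2x16 example above.
 */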
539
540 void
541 vec4_visitor::visit_instructions(const exec_list *list)
542 {
543 foreach_list(node, list) {
544 ir_instruction *ir = (ir_instruction *)node;
545
546 base_ir = ir;
547 ir->accept(this);
548 }
549 }
550
551
552 static int
553 type_size(const struct glsl_type *type)
554 {
555 unsigned int i;
556 int size;
557
558 switch (type->base_type) {
559 case GLSL_TYPE_UINT:
560 case GLSL_TYPE_INT:
561 case GLSL_TYPE_FLOAT:
562 case GLSL_TYPE_BOOL:
563 if (type->is_matrix()) {
564 return type->matrix_columns;
565 } else {
566 /* Regardless of the size of the vector, it gets a vec4. This is bad
567 * packing for things like floats, but otherwise arrays become a
568 * mess. Hopefully a later pass over the code can pack scalars
569 * down if appropriate.
570 */
571 return 1;
572 }
573 case GLSL_TYPE_ARRAY:
574 assert(type->length > 0);
575 return type_size(type->fields.array) * type->length;
576 case GLSL_TYPE_STRUCT:
577 size = 0;
578 for (i = 0; i < type->length; i++) {
579 size += type_size(type->fields.structure[i].type);
580 }
581 return size;
582 case GLSL_TYPE_SAMPLER:
583 /* Samplers take up one slot in UNIFORMS[], but they're baked in
584 * at link time.
585 */
586 return 1;
587 case GLSL_TYPE_ATOMIC_UINT:
588 return 0;
589 case GLSL_TYPE_IMAGE:
590 case GLSL_TYPE_VOID:
591 case GLSL_TYPE_ERROR:
592 case GLSL_TYPE_INTERFACE:
593 assert(0);
594 break;
595 }
596
597 return 0;
598 }
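
/* A few illustrative results of type_size() under the vec4 packing rules
 * above (each scalar or vector still occupies one full vec4 slot):
 *
 *    float    -> 1        vec3                        -> 1
 *    mat3     -> 3        float[4]                    -> 4
 *                         struct { vec3 a; float b; } -> 2
 */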
599
600 int
601 vec4_visitor::virtual_grf_alloc(int size)
602 {
603 if (virtual_grf_array_size <= virtual_grf_count) {
604 if (virtual_grf_array_size == 0)
605 virtual_grf_array_size = 16;
606 else
607 virtual_grf_array_size *= 2;
608 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
609 virtual_grf_array_size);
610 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
611 virtual_grf_array_size);
612 }
613 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
614 virtual_grf_reg_count += size;
615 virtual_grf_sizes[virtual_grf_count] = size;
616 return virtual_grf_count++;
617 }
618
619 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
620 {
621 init();
622
623 this->file = GRF;
624 this->reg = v->virtual_grf_alloc(type_size(type));
625
626 if (type->is_array() || type->is_record()) {
627 this->swizzle = BRW_SWIZZLE_NOOP;
628 } else {
629 this->swizzle = swizzle_for_size(type->vector_elements);
630 }
631
632 this->type = brw_type_for_base_type(type);
633 }
634
635 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
636 {
637 init();
638
639 this->file = GRF;
640 this->reg = v->virtual_grf_alloc(type_size(type));
641
642 if (type->is_array() || type->is_record()) {
643 this->writemask = WRITEMASK_XYZW;
644 } else {
645 this->writemask = (1 << type->vector_elements) - 1;
646 }
647
648 this->type = brw_type_for_base_type(type);
649 }
650
651 /* Our support for uniforms is piggy-backed on the struct
652 * gl_fragment_program, because that's where the values actually
653 * get stored, rather than in some global gl_shader_program uniform
654 * store.
655 */
656 void
657 vec4_visitor::setup_uniform_values(ir_variable *ir)
658 {
659 int namelen = strlen(ir->name);
660
661 /* The data for our (non-builtin) uniforms is stored in a series of
662 * gl_uniform_driver_storage structs for each subcomponent that
663 * glGetUniformLocation() could name. We know it's been set up in the same
664 * order we'd walk the type, so walk the list of storage and find anything
665 * with our name, or the prefix of a component that starts with our name.
666 */
667 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
668 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
669
670 if (strncmp(ir->name, storage->name, namelen) != 0 ||
671 (storage->name[namelen] != 0 &&
672 storage->name[namelen] != '.' &&
673 storage->name[namelen] != '[')) {
674 continue;
675 }
676
677 gl_constant_value *components = storage->storage;
678 unsigned vector_count = (MAX2(storage->array_elements, 1) *
679 storage->type->matrix_columns);
680
681 for (unsigned s = 0; s < vector_count; s++) {
682 assert(uniforms < uniform_array_size);
683 uniform_vector_size[uniforms] = storage->type->vector_elements;
684
685 int i;
686 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
687 stage_prog_data->param[uniforms * 4 + i] = &components->f;
688 components++;
689 }
690 for (; i < 4; i++) {
691 static float zero = 0;
692 stage_prog_data->param[uniforms * 4 + i] = &zero;
693 }
694
695 uniforms++;
696 }
697 }
698 }
699
700 void
701 vec4_visitor::setup_uniform_clipplane_values()
702 {
703 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
704
705 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
706 assert(this->uniforms < uniform_array_size);
707 this->uniform_vector_size[this->uniforms] = 4;
708 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
709 this->userplane[i].type = BRW_REGISTER_TYPE_F;
710 for (int j = 0; j < 4; ++j) {
711 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
712 }
713 ++this->uniforms;
714 }
715 }
716
717 /* Our support for builtin uniforms is even scarier than non-builtin.
718 * It sits on top of the PROG_STATE_VAR parameters that are
719 * automatically updated from GL context state.
720 */
721 void
722 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
723 {
724 const ir_state_slot *const slots = ir->state_slots;
725 assert(ir->state_slots != NULL);
726
727 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
728 /* This state reference has already been setup by ir_to_mesa,
729 * but we'll get the same index back here. We can reference
730 * ParameterValues directly, since unlike brw_fs.cpp, we never
731 * add new state references during compile.
732 */
733 int index = _mesa_add_state_reference(this->prog->Parameters,
734 (gl_state_index *)slots[i].tokens);
735 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
736
737 assert(this->uniforms < uniform_array_size);
738 this->uniform_vector_size[this->uniforms] = 0;
739 /* Add each of the unique swizzled channels of the element.
740 * This will end up matching the size of the glsl_type of this field.
741 */
742 int last_swiz = -1;
743 for (unsigned int j = 0; j < 4; j++) {
744 int swiz = GET_SWZ(slots[i].swizzle, j);
745 last_swiz = swiz;
746
747 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
748 assert(this->uniforms < uniform_array_size);
749 if (swiz <= last_swiz)
750 this->uniform_vector_size[this->uniforms]++;
751 }
752 this->uniforms++;
753 }
754 }
755
756 dst_reg *
757 vec4_visitor::variable_storage(ir_variable *var)
758 {
759 return (dst_reg *)hash_table_find(this->variable_ht, var);
760 }
761
762 void
763 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
764 {
765 ir_expression *expr = ir->as_expression();
766
767 *predicate = BRW_PREDICATE_NORMAL;
768
769 if (expr) {
770 src_reg op[2];
771 vec4_instruction *inst;
772
773 assert(expr->get_num_operands() <= 2);
774 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
775 expr->operands[i]->accept(this);
776 op[i] = this->result;
777
778 resolve_ud_negate(&op[i]);
779 }
780
781 switch (expr->operation) {
782 case ir_unop_logic_not:
783 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
784 inst->conditional_mod = BRW_CONDITIONAL_Z;
785 break;
786
787 case ir_binop_logic_xor:
788 inst = emit(XOR(dst_null_d(), op[0], op[1]));
789 inst->conditional_mod = BRW_CONDITIONAL_NZ;
790 break;
791
792 case ir_binop_logic_or:
793 inst = emit(OR(dst_null_d(), op[0], op[1]));
794 inst->conditional_mod = BRW_CONDITIONAL_NZ;
795 break;
796
797 case ir_binop_logic_and:
798 inst = emit(AND(dst_null_d(), op[0], op[1]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 break;
801
802 case ir_unop_f2b:
803 if (brw->gen >= 6) {
804 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
805 } else {
806 inst = emit(MOV(dst_null_f(), op[0]));
807 inst->conditional_mod = BRW_CONDITIONAL_NZ;
808 }
809 break;
810
811 case ir_unop_i2b:
812 if (brw->gen >= 6) {
813 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
814 } else {
815 inst = emit(MOV(dst_null_d(), op[0]));
816 inst->conditional_mod = BRW_CONDITIONAL_NZ;
817 }
818 break;
819
820 case ir_binop_all_equal:
821 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
822 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
823 break;
824
825 case ir_binop_any_nequal:
826 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
827 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
828 break;
829
830 case ir_unop_any:
831 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
832 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
833 break;
834
835 case ir_binop_greater:
836 case ir_binop_gequal:
837 case ir_binop_less:
838 case ir_binop_lequal:
839 case ir_binop_equal:
840 case ir_binop_nequal:
841 emit(CMP(dst_null_d(), op[0], op[1],
842 brw_conditional_for_comparison(expr->operation)));
843 break;
844
845 default:
846 assert(!"not reached");
847 break;
848 }
849 return;
850 }
851
852 ir->accept(this);
853
854 resolve_ud_negate(&this->result);
855
856 if (brw->gen >= 6) {
857 vec4_instruction *inst = emit(AND(dst_null_d(),
858 this->result, src_reg(1)));
859 inst->conditional_mod = BRW_CONDITIONAL_NZ;
860 } else {
861 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
862 inst->conditional_mod = BRW_CONDITIONAL_NZ;
863 }
864 }
865
866 /**
867 * Emit a gen6 IF statement with the comparison folded into the IF
868 * instruction.
869 */
870 void
871 vec4_visitor::emit_if_gen6(ir_if *ir)
872 {
873 ir_expression *expr = ir->condition->as_expression();
874
875 if (expr) {
876 src_reg op[2];
877 dst_reg temp;
878
879 assert(expr->get_num_operands() <= 2);
880 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
881 expr->operands[i]->accept(this);
882 op[i] = this->result;
883 }
884
885 switch (expr->operation) {
886 case ir_unop_logic_not:
887 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
888 return;
889
890 case ir_binop_logic_xor:
891 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
892 return;
893
894 case ir_binop_logic_or:
895 temp = dst_reg(this, glsl_type::bool_type);
896 emit(OR(temp, op[0], op[1]));
897 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
898 return;
899
900 case ir_binop_logic_and:
901 temp = dst_reg(this, glsl_type::bool_type);
902 emit(AND(temp, op[0], op[1]));
903 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
904 return;
905
906 case ir_unop_f2b:
907 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
908 return;
909
910 case ir_unop_i2b:
911 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
912 return;
913
914 case ir_binop_greater:
915 case ir_binop_gequal:
916 case ir_binop_less:
917 case ir_binop_lequal:
918 case ir_binop_equal:
919 case ir_binop_nequal:
920 emit(IF(op[0], op[1],
921 brw_conditional_for_comparison(expr->operation)));
922 return;
923
924 case ir_binop_all_equal:
925 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
926 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
927 return;
928
929 case ir_binop_any_nequal:
930 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
931 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
932 return;
933
934 case ir_unop_any:
935 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
936 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
937 return;
938
939 default:
940 assert(!"not reached");
941 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
942 return;
943 }
944 return;
945 }
946
947 ir->condition->accept(this);
948
949 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
950 }
951
952 void
953 vec4_visitor::visit(ir_variable *ir)
954 {
955 dst_reg *reg = NULL;
956
957 if (variable_storage(ir))
958 return;
959
960 switch (ir->data.mode) {
961 case ir_var_shader_in:
962 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
963 break;
964
965 case ir_var_shader_out:
966 reg = new(mem_ctx) dst_reg(this, ir->type);
967
968 for (int i = 0; i < type_size(ir->type); i++) {
969 output_reg[ir->data.location + i] = *reg;
970 output_reg[ir->data.location + i].reg_offset = i;
971 output_reg[ir->data.location + i].type =
972 brw_type_for_base_type(ir->type->get_scalar_type());
973 output_reg_annotation[ir->data.location + i] = ir->name;
974 }
975 break;
976
977 case ir_var_auto:
978 case ir_var_temporary:
979 reg = new(mem_ctx) dst_reg(this, ir->type);
980 break;
981
982 case ir_var_uniform:
983 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
984
985 /* Thanks to the lower_ubo_reference pass, we will see only
986 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
987 * variables, so no need for them to be in variable_ht.
988 *
989 * Atomic counters take no uniform storage, no need to do
990 * anything here.
991 */
992 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
993 return;
994
995 /* Track how big the whole uniform variable is, in case we need to put a
996 * copy of its data into pull constants for array access.
997 */
998 assert(this->uniforms < uniform_array_size);
999 this->uniform_size[this->uniforms] = type_size(ir->type);
1000
1001 if (!strncmp(ir->name, "gl_", 3)) {
1002 setup_builtin_uniform_values(ir);
1003 } else {
1004 setup_uniform_values(ir);
1005 }
1006 break;
1007
1008 case ir_var_system_value:
1009 reg = make_reg_for_system_value(ir);
1010 break;
1011
1012 default:
1013 assert(!"not reached");
1014 }
1015
1016 reg->type = brw_type_for_base_type(ir->type);
1017 hash_table_insert(this->variable_ht, reg, ir);
1018 }
1019
1020 void
1021 vec4_visitor::visit(ir_loop *ir)
1022 {
1023 /* We don't want debugging output to print the whole body of the
1024 * loop as the annotation.
1025 */
1026 this->base_ir = NULL;
1027
1028 emit(BRW_OPCODE_DO);
1029
1030 visit_instructions(&ir->body_instructions);
1031
1032 emit(BRW_OPCODE_WHILE);
1033 }
1034
1035 void
1036 vec4_visitor::visit(ir_loop_jump *ir)
1037 {
1038 switch (ir->mode) {
1039 case ir_loop_jump::jump_break:
1040 emit(BRW_OPCODE_BREAK);
1041 break;
1042 case ir_loop_jump::jump_continue:
1043 emit(BRW_OPCODE_CONTINUE);
1044 break;
1045 }
1046 }
1047
1048
1049 void
1050 vec4_visitor::visit(ir_function_signature *ir)
1051 {
1052 assert(0);
1053 (void)ir;
1054 }
1055
1056 void
1057 vec4_visitor::visit(ir_function *ir)
1058 {
1059 /* Ignore function bodies other than main() -- we shouldn't see calls to
1060 * them since they should all be inlined.
1061 */
1062 if (strcmp(ir->name, "main") == 0) {
1063 const ir_function_signature *sig;
1064 exec_list empty;
1065
1066 sig = ir->matching_signature(NULL, &empty);
1067
1068 assert(sig);
1069
1070 visit_instructions(&sig->body);
1071 }
1072 }
1073
1074 bool
1075 vec4_visitor::try_emit_sat(ir_expression *ir)
1076 {
1077 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1078 if (!sat_src)
1079 return false;
1080
1081 sat_src->accept(this);
1082 src_reg src = this->result;
1083
1084 this->result = src_reg(this, ir->type);
1085 vec4_instruction *inst;
1086 inst = emit(MOV(dst_reg(this->result), src));
1087 inst->saturate = true;
1088
1089 return true;
1090 }
1091
1092 bool
1093 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1094 {
1095 /* 3-src instructions were introduced in gen6. */
1096 if (brw->gen < 6)
1097 return false;
1098
1099 /* MAD can only handle floating-point data. */
1100 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1101 return false;
1102
1103 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1104 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1105
1106 if (!mul || mul->operation != ir_binop_mul)
1107 return false;
1108
1109 nonmul->accept(this);
1110 src_reg src0 = fix_3src_operand(this->result);
1111
1112 mul->operands[0]->accept(this);
1113 src_reg src1 = fix_3src_operand(this->result);
1114
1115 mul->operands[1]->accept(this);
1116 src_reg src2 = fix_3src_operand(this->result);
1117
1118 this->result = src_reg(this, ir->type);
1119 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1120
1121 return true;
1122 }
1123
1124 void
1125 vec4_visitor::emit_bool_comparison(unsigned int op,
1126 dst_reg dst, src_reg src0, src_reg src1)
1127 {
1128 /* original gen4 does destination conversion before comparison. */
1129 if (brw->gen < 5)
1130 dst.type = src0.type;
1131
1132 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1133
1134 dst.type = BRW_REGISTER_TYPE_D;
1135 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1136 }
1137
1138 void
1139 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1140 src_reg src0, src_reg src1)
1141 {
1142 vec4_instruction *inst;
1143
1144 if (brw->gen >= 6) {
1145 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1146 inst->conditional_mod = conditionalmod;
1147 } else {
1148 emit(CMP(dst, src0, src1, conditionalmod));
1149
1150 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1151 inst->predicate = BRW_PREDICATE_NORMAL;
1152 }
1153 }
1154
1155 void
1156 vec4_visitor::emit_lrp(const dst_reg &dst,
1157 const src_reg &x, const src_reg &y, const src_reg &a)
1158 {
1159 if (brw->gen >= 6) {
1160 /* Note that the instruction's argument order is reversed from GLSL
1161 * and the IR.
1162 */
1163 emit(LRP(dst,
1164 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1165 } else {
1166 /* Earlier generations don't support three source operations, so we
1167 * need to emit x*(1-a) + y*a.
1168 */
1169 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1170 one_minus_a.writemask = dst.writemask;
1171
1172 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1173 vec4_instruction *mul = emit(MUL(dst_null_f(), y, a));
1174 mul->writes_accumulator = true;
1175 emit(MAC(dst, x, src_reg(one_minus_a)));
1176 }
1177 }
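
/* Illustrative sketch of the gen4/5 path above: for lerp(x, y, a) the
 * three emitted instructions compute x * (1 - a) + y * a roughly as
 *
 *    ADD  tmp,  -a,  1.0f     ; tmp = 1 - a            (one_minus_a)
 *    MUL  null,  y,  a        ; acc = y * a            (accumulator write)
 *    MAC  dst,   x,  tmp      ; dst = x * tmp + acc
 *
 * relying on the MUL's implicit accumulator write being picked up by MAC.
 */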
1178
1179 void
1180 vec4_visitor::visit(ir_expression *ir)
1181 {
1182 unsigned int operand;
1183 src_reg op[Elements(ir->operands)];
1184 src_reg result_src;
1185 dst_reg result_dst;
1186 vec4_instruction *inst;
1187
1188 if (try_emit_sat(ir))
1189 return;
1190
1191 if (ir->operation == ir_binop_add) {
1192 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1193 return;
1194 }
1195
1196 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1197 this->result.file = BAD_FILE;
1198 ir->operands[operand]->accept(this);
1199 if (this->result.file == BAD_FILE) {
1200 fprintf(stderr, "Failed to get tree for expression operand:\n");
1201 ir->operands[operand]->fprint(stderr);
1202 exit(1);
1203 }
1204 op[operand] = this->result;
1205
1206 /* Matrix expression operands should have been broken down to vector
1207 * operations already.
1208 */
1209 assert(!ir->operands[operand]->type->is_matrix());
1210 }
1211
1212 int vector_elements = ir->operands[0]->type->vector_elements;
1213 if (ir->operands[1]) {
1214 vector_elements = MAX2(vector_elements,
1215 ir->operands[1]->type->vector_elements);
1216 }
1217
1218 this->result.file = BAD_FILE;
1219
1220 /* Storage for our result. Ideally for an assignment we'd be using
1221 * the actual storage for the result here, instead.
1222 */
1223 result_src = src_reg(this, ir->type);
1224 /* convenience for the emit functions below. */
1225 result_dst = dst_reg(result_src);
1226 /* If nothing special happens, this is the result. */
1227 this->result = result_src;
1228 /* Limit writes to the channels that will be used by result_src later.
1229 * This does limit this temp's use as a temporary for multi-instruction
1230 * sequences.
1231 */
1232 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1233
1234 switch (ir->operation) {
1235 case ir_unop_logic_not:
1236 /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes the
1237 * one's complement of the whole register, not just bit 0.
1238 */
1239 emit(XOR(result_dst, op[0], src_reg(1)));
1240 break;
1241 case ir_unop_neg:
1242 op[0].negate = !op[0].negate;
1243 emit(MOV(result_dst, op[0]));
1244 break;
1245 case ir_unop_abs:
1246 op[0].abs = true;
1247 op[0].negate = false;
1248 emit(MOV(result_dst, op[0]));
1249 break;
1250
1251 case ir_unop_sign:
1252 if (ir->type->is_float()) {
1253 /* AND(val, 0x80000000) gives the sign bit.
1254 *
1255 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1256 * zero.
1257 */
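
/* Worked example (sketch): for val == -3.5f (0xc0600000) the CMP sets the
 * predicate, the AND keeps 0x80000000, and the predicated OR yields
 * 0xbf800000 == -1.0f.  For val == 0.0f the predicate is false, so the
 * AND result 0x00000000 == 0.0f is left as the answer.
 */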
1258 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1259
1260 op[0].type = BRW_REGISTER_TYPE_UD;
1261 result_dst.type = BRW_REGISTER_TYPE_UD;
1262 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1263
1264 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1265 inst->predicate = BRW_PREDICATE_NORMAL;
1266
1267 this->result.type = BRW_REGISTER_TYPE_F;
1268 } else {
1269 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1270 * -> non-negative val generates 0x00000000.
1271 * Predicated OR sets 1 if val is positive.
1272 */
1273 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1274
1275 emit(ASR(result_dst, op[0], src_reg(31)));
1276
1277 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1278 inst->predicate = BRW_PREDICATE_NORMAL;
1279 }
1280 break;
1281
1282 case ir_unop_rcp:
1283 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1284 break;
1285
1286 case ir_unop_exp2:
1287 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1288 break;
1289 case ir_unop_log2:
1290 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1291 break;
1292 case ir_unop_exp:
1293 case ir_unop_log:
1294 assert(!"not reached: should be handled by ir_explog_to_explog2");
1295 break;
1296 case ir_unop_sin:
1297 case ir_unop_sin_reduced:
1298 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1299 break;
1300 case ir_unop_cos:
1301 case ir_unop_cos_reduced:
1302 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1303 break;
1304
1305 case ir_unop_dFdx:
1306 case ir_unop_dFdy:
1307 assert(!"derivatives not valid in vertex shader");
1308 break;
1309
1310 case ir_unop_bitfield_reverse:
1311 emit(BFREV(result_dst, op[0]));
1312 break;
1313 case ir_unop_bit_count:
1314 emit(CBIT(result_dst, op[0]));
1315 break;
1316 case ir_unop_find_msb: {
1317 src_reg temp = src_reg(this, glsl_type::uint_type);
1318
1319 inst = emit(FBH(dst_reg(temp), op[0]));
1320 inst->dst.writemask = WRITEMASK_XYZW;
1321
1322 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1323 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1324 * subtract the result from 31 to convert the MSB count into an LSB count.
1325 */
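
/* Worked example (sketch): for an input of 0x40000000, FBH returns 1
 * (counting from the MSB side) and the predicated ADD below computes
 * 31 - 1 = 30, the LSB-side index findMSB() expects.  For an input of 0,
 * FBH returns 0xFFFFFFFF, the predicate is false, and -1 is left in the
 * result as required.
 */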
1326
1327 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1328 temp.swizzle = BRW_SWIZZLE_NOOP;
1329 emit(MOV(result_dst, temp));
1330
1331 src_reg src_tmp = src_reg(result_dst);
1332 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1333
1334 src_tmp.negate = true;
1335 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1336 inst->predicate = BRW_PREDICATE_NORMAL;
1337 break;
1338 }
1339 case ir_unop_find_lsb:
1340 emit(FBL(result_dst, op[0]));
1341 break;
1342
1343 case ir_unop_noise:
1344 assert(!"not reached: should be handled by lower_noise");
1345 break;
1346
1347 case ir_binop_add:
1348 emit(ADD(result_dst, op[0], op[1]));
1349 break;
1350 case ir_binop_sub:
1351 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1352 break;
1353
1354 case ir_binop_mul:
1355 if (brw->gen < 8 && ir->type->is_integer()) {
1356 /* For integer multiplication, the MUL uses the low 16 bits of one of
1357 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1358 * accumulates the contribution of the upper 16 bits of that
1359 * operand. If we can determine that one of the args is in the low
1360 * 16 bits, though, we can just emit a single MUL.
1361 */
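
/* Illustrative sketch of the full 32-bit multiply emitted by the final
 * else branch below when neither operand fits in 16 bits:
 *
 *    MUL  acc0, op0, op1    ; partial product using the low 16 bits
 *    MACH null, op0, op1    ; folds in the upper 16 bits, result in acc0
 *    MOV  dst,  acc0        ; copy the accumulated product to the GRF
 */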
1362 if (ir->operands[0]->is_uint16_constant()) {
1363 if (brw->gen < 7)
1364 emit(MUL(result_dst, op[0], op[1]));
1365 else
1366 emit(MUL(result_dst, op[1], op[0]));
1367 } else if (ir->operands[1]->is_uint16_constant()) {
1368 if (brw->gen < 7)
1369 emit(MUL(result_dst, op[1], op[0]));
1370 else
1371 emit(MUL(result_dst, op[0], op[1]));
1372 } else {
1373 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1374
1375 emit(MUL(acc, op[0], op[1]));
1376 emit(MACH(dst_null_d(), op[0], op[1]));
1377 emit(MOV(result_dst, src_reg(acc)));
1378 }
1379 } else {
1380 emit(MUL(result_dst, op[0], op[1]));
1381 }
1382 break;
1383 case ir_binop_imul_high: {
1384 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1385
1386 emit(MUL(acc, op[0], op[1]));
1387 emit(MACH(result_dst, op[0], op[1]));
1388 break;
1389 }
1390 case ir_binop_div:
1391 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1392 assert(ir->type->is_integer());
1393 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1394 break;
1395 case ir_binop_carry: {
1396 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1397
1398 emit(ADDC(dst_null_ud(), op[0], op[1]));
1399 emit(MOV(result_dst, src_reg(acc)));
1400 break;
1401 }
1402 case ir_binop_borrow: {
1403 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1404
1405 emit(SUBB(dst_null_ud(), op[0], op[1]));
1406 emit(MOV(result_dst, src_reg(acc)));
1407 break;
1408 }
1409 case ir_binop_mod:
1410 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1411 assert(ir->type->is_integer());
1412 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1413 break;
1414
1415 case ir_binop_less:
1416 case ir_binop_greater:
1417 case ir_binop_lequal:
1418 case ir_binop_gequal:
1419 case ir_binop_equal:
1420 case ir_binop_nequal: {
1421 emit(CMP(result_dst, op[0], op[1],
1422 brw_conditional_for_comparison(ir->operation)));
1423 emit(AND(result_dst, result_src, src_reg(0x1)));
1424 break;
1425 }
1426
1427 case ir_binop_all_equal:
1428 /* "==" operator producing a scalar boolean. */
1429 if (ir->operands[0]->type->is_vector() ||
1430 ir->operands[1]->type->is_vector()) {
1431 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1432 emit(MOV(result_dst, src_reg(0)));
1433 inst = emit(MOV(result_dst, src_reg(1)));
1434 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1435 } else {
1436 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1437 emit(AND(result_dst, result_src, src_reg(0x1)));
1438 }
1439 break;
1440 case ir_binop_any_nequal:
1441 /* "!=" operator producing a scalar boolean. */
1442 if (ir->operands[0]->type->is_vector() ||
1443 ir->operands[1]->type->is_vector()) {
1444 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1445
1446 emit(MOV(result_dst, src_reg(0)));
1447 inst = emit(MOV(result_dst, src_reg(1)));
1448 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1449 } else {
1450 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1451 emit(AND(result_dst, result_src, src_reg(0x1)));
1452 }
1453 break;
1454
1455 case ir_unop_any:
1456 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1457 emit(MOV(result_dst, src_reg(0)));
1458
1459 inst = emit(MOV(result_dst, src_reg(1)));
1460 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1461 break;
1462
1463 case ir_binop_logic_xor:
1464 emit(XOR(result_dst, op[0], op[1]));
1465 break;
1466
1467 case ir_binop_logic_or:
1468 emit(OR(result_dst, op[0], op[1]));
1469 break;
1470
1471 case ir_binop_logic_and:
1472 emit(AND(result_dst, op[0], op[1]));
1473 break;
1474
1475 case ir_binop_dot:
1476 assert(ir->operands[0]->type->is_vector());
1477 assert(ir->operands[0]->type == ir->operands[1]->type);
1478 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1479 break;
1480
1481 case ir_unop_sqrt:
1482 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1483 break;
1484 case ir_unop_rsq:
1485 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1486 break;
1487
1488 case ir_unop_bitcast_i2f:
1489 case ir_unop_bitcast_u2f:
1490 this->result = op[0];
1491 this->result.type = BRW_REGISTER_TYPE_F;
1492 break;
1493
1494 case ir_unop_bitcast_f2i:
1495 this->result = op[0];
1496 this->result.type = BRW_REGISTER_TYPE_D;
1497 break;
1498
1499 case ir_unop_bitcast_f2u:
1500 this->result = op[0];
1501 this->result.type = BRW_REGISTER_TYPE_UD;
1502 break;
1503
1504 case ir_unop_i2f:
1505 case ir_unop_i2u:
1506 case ir_unop_u2i:
1507 case ir_unop_u2f:
1508 case ir_unop_b2f:
1509 case ir_unop_b2i:
1510 case ir_unop_f2i:
1511 case ir_unop_f2u:
1512 emit(MOV(result_dst, op[0]));
1513 break;
1514 case ir_unop_f2b:
1515 case ir_unop_i2b: {
1516 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1517 emit(AND(result_dst, result_src, src_reg(1)));
1518 break;
1519 }
1520
1521 case ir_unop_trunc:
1522 emit(RNDZ(result_dst, op[0]));
1523 break;
1524 case ir_unop_ceil:
1525 op[0].negate = !op[0].negate;
1526 inst = emit(RNDD(result_dst, op[0]));
1527 this->result.negate = true;
1528 break;
1529 case ir_unop_floor:
1530 inst = emit(RNDD(result_dst, op[0]));
1531 break;
1532 case ir_unop_fract:
1533 inst = emit(FRC(result_dst, op[0]));
1534 break;
1535 case ir_unop_round_even:
1536 emit(RNDE(result_dst, op[0]));
1537 break;
1538
1539 case ir_binop_min:
1540 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1541 break;
1542 case ir_binop_max:
1543 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1544 break;
1545
1546 case ir_binop_pow:
1547 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1548 break;
1549
1550 case ir_unop_bit_not:
1551 inst = emit(NOT(result_dst, op[0]));
1552 break;
1553 case ir_binop_bit_and:
1554 inst = emit(AND(result_dst, op[0], op[1]));
1555 break;
1556 case ir_binop_bit_xor:
1557 inst = emit(XOR(result_dst, op[0], op[1]));
1558 break;
1559 case ir_binop_bit_or:
1560 inst = emit(OR(result_dst, op[0], op[1]));
1561 break;
1562
1563 case ir_binop_lshift:
1564 inst = emit(SHL(result_dst, op[0], op[1]));
1565 break;
1566
1567 case ir_binop_rshift:
1568 if (ir->type->base_type == GLSL_TYPE_INT)
1569 inst = emit(ASR(result_dst, op[0], op[1]));
1570 else
1571 inst = emit(SHR(result_dst, op[0], op[1]));
1572 break;
1573
1574 case ir_binop_bfm:
1575 emit(BFI1(result_dst, op[0], op[1]));
1576 break;
1577
1578 case ir_binop_ubo_load: {
1579 ir_constant *uniform_block = ir->operands[0]->as_constant();
1580 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1581 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1582 src_reg offset;
1583
1584 /* Now, load the vector from that offset. */
1585 assert(ir->type->is_vector() || ir->type->is_scalar());
1586
1587 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1588 packed_consts.type = result.type;
1589 src_reg surf_index =
1590 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1591 if (const_offset_ir) {
1592 if (brw->gen >= 8) {
1593 /* Store the offset in a GRF so we can send-from-GRF. */
1594 offset = src_reg(this, glsl_type::int_type);
1595 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1596 } else {
1597 /* Immediates are fine on older generations since they'll be moved
1598 * to a (potentially fake) MRF at the generator level.
1599 */
1600 offset = src_reg(const_offset / 16);
1601 }
1602 } else {
1603 offset = src_reg(this, glsl_type::uint_type);
1604 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1605 }
1606
1607 if (brw->gen >= 7) {
1608 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1609 grf_offset.type = offset.type;
1610
1611 emit(MOV(grf_offset, offset));
1612
1613 emit(new(mem_ctx) vec4_instruction(this,
1614 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1615 dst_reg(packed_consts),
1616 surf_index,
1617 src_reg(grf_offset)));
1618 } else {
1619 vec4_instruction *pull =
1620 emit(new(mem_ctx) vec4_instruction(this,
1621 VS_OPCODE_PULL_CONSTANT_LOAD,
1622 dst_reg(packed_consts),
1623 surf_index,
1624 offset));
1625 pull->base_mrf = 14;
1626 pull->mlen = 1;
1627 }
1628
1629 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1630 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1631 const_offset % 16 / 4,
1632 const_offset % 16 / 4,
1633 const_offset % 16 / 4);
1634
1635 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1636 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1637 emit(CMP(result_dst, packed_consts, src_reg(0u),
1638 BRW_CONDITIONAL_NZ));
1639 emit(AND(result_dst, result, src_reg(0x1)));
1640 } else {
1641 emit(MOV(result_dst, packed_consts));
1642 }
1643 break;
1644 }
1645
1646 case ir_binop_vector_extract:
1647 assert(!"should have been lowered by vec_index_to_cond_assign");
1648 break;
1649
1650 case ir_triop_fma:
1651 op[0] = fix_3src_operand(op[0]);
1652 op[1] = fix_3src_operand(op[1]);
1653 op[2] = fix_3src_operand(op[2]);
1654 /* Note that the instruction's argument order is reversed from GLSL
1655 * and the IR.
1656 */
1657 emit(MAD(result_dst, op[2], op[1], op[0]));
1658 break;
1659
1660 case ir_triop_lrp:
1661 emit_lrp(result_dst, op[0], op[1], op[2]);
1662 break;
1663
1664 case ir_triop_csel:
1665 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1666 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1667 inst->predicate = BRW_PREDICATE_NORMAL;
1668 break;
1669
1670 case ir_triop_bfi:
1671 op[0] = fix_3src_operand(op[0]);
1672 op[1] = fix_3src_operand(op[1]);
1673 op[2] = fix_3src_operand(op[2]);
1674 emit(BFI2(result_dst, op[0], op[1], op[2]));
1675 break;
1676
1677 case ir_triop_bitfield_extract:
1678 op[0] = fix_3src_operand(op[0]);
1679 op[1] = fix_3src_operand(op[1]);
1680 op[2] = fix_3src_operand(op[2]);
1681 /* Note that the instruction's argument order is reversed from GLSL
1682 * and the IR.
1683 */
1684 emit(BFE(result_dst, op[2], op[1], op[0]));
1685 break;
1686
1687 case ir_triop_vector_insert:
1688 assert(!"should have been lowered by lower_vector_insert");
1689 break;
1690
1691 case ir_quadop_bitfield_insert:
1692 assert(!"not reached: should be handled by "
1693 "bitfield_insert_to_bfm_bfi\n");
1694 break;
1695
1696 case ir_quadop_vector:
1697 assert(!"not reached: should be handled by lower_quadop_vector");
1698 break;
1699
1700 case ir_unop_pack_half_2x16:
1701 emit_pack_half_2x16(result_dst, op[0]);
1702 break;
1703 case ir_unop_unpack_half_2x16:
1704 emit_unpack_half_2x16(result_dst, op[0]);
1705 break;
1706 case ir_unop_pack_snorm_2x16:
1707 case ir_unop_pack_snorm_4x8:
1708 case ir_unop_pack_unorm_2x16:
1709 case ir_unop_pack_unorm_4x8:
1710 case ir_unop_unpack_snorm_2x16:
1711 case ir_unop_unpack_snorm_4x8:
1712 case ir_unop_unpack_unorm_2x16:
1713 case ir_unop_unpack_unorm_4x8:
1714 assert(!"not reached: should be handled by lower_packing_builtins");
1715 break;
1716 case ir_unop_unpack_half_2x16_split_x:
1717 case ir_unop_unpack_half_2x16_split_y:
1718 case ir_binop_pack_half_2x16_split:
1719 assert(!"not reached: should not occur in vertex shader");
1720 break;
1721 case ir_binop_ldexp:
1722 assert(!"not reached: should be handled by ldexp_to_arith()");
1723 break;
1724 }
1725 }
1726
1727
1728 void
1729 vec4_visitor::visit(ir_swizzle *ir)
1730 {
1731 src_reg src;
1732 int i = 0;
1733 int swizzle[4];
1734
1735 /* Note that this is only swizzles in expressions, not those on the left
1736 * hand side of an assignment, which do write masking. See ir_assignment
1737 * for that.
1738 */
1739
1740 ir->val->accept(this);
1741 src = this->result;
1742 assert(src.file != BAD_FILE);
1743
1744 for (i = 0; i < ir->type->vector_elements; i++) {
1745 switch (i) {
1746 case 0:
1747 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1748 break;
1749 case 1:
1750 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1751 break;
1752 case 2:
1753 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1754 break;
1755 case 3:
1756 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1757 break;
1758 }
1759 }
1760 for (; i < 4; i++) {
1761 /* Replicate the last channel out. */
1762 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1763 }
1764
1765 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1766
1767 this->result = src;
1768 }
1769
1770 void
1771 vec4_visitor::visit(ir_dereference_variable *ir)
1772 {
1773 const struct glsl_type *type = ir->type;
1774 dst_reg *reg = variable_storage(ir->var);
1775
1776 if (!reg) {
1777 fail("Failed to find variable storage for %s\n", ir->var->name);
1778 this->result = src_reg(brw_null_reg());
1779 return;
1780 }
1781
1782 this->result = src_reg(*reg);
1783
1784 /* System values get their swizzle from the dst_reg writemask */
1785 if (ir->var->data.mode == ir_var_system_value)
1786 return;
1787
1788 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1789 this->result.swizzle = swizzle_for_size(type->vector_elements);
1790 }
1791
1792
1793 int
1794 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1795 {
1796 /* Under normal circumstances array elements are stored consecutively, so
1797 * the stride is equal to the size of the array element.
1798 */
1799 return type_size(ir->type);
1800 }
1801
1802
1803 void
1804 vec4_visitor::visit(ir_dereference_array *ir)
1805 {
1806 ir_constant *constant_index;
1807 src_reg src;
1808 int array_stride = compute_array_stride(ir);
1809
1810 constant_index = ir->array_index->constant_expression_value();
1811
1812 ir->array->accept(this);
1813 src = this->result;
1814
1815 if (constant_index) {
1816 src.reg_offset += constant_index->value.i[0] * array_stride;
1817 } else {
1818 /* Variable index array dereference. It eats the "vec4" of the
1819 * base of the array and an index that offsets the Mesa register
1820 * index.
1821 */
1822 ir->array_index->accept(this);
1823
1824 src_reg index_reg;
1825
1826 if (array_stride == 1) {
1827 index_reg = this->result;
1828 } else {
1829 index_reg = src_reg(this, glsl_type::int_type);
1830
1831 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1832 }
1833
1834 if (src.reladdr) {
1835 src_reg temp = src_reg(this, glsl_type::int_type);
1836
1837 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1838
1839 index_reg = temp;
1840 }
1841
1842 src.reladdr = ralloc(mem_ctx, src_reg);
1843 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1844 }
1845
1846 /* If the type is smaller than a vec4, replicate the last channel out. */
1847 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1848 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1849 else
1850 src.swizzle = BRW_SWIZZLE_NOOP;
1851 src.type = brw_type_for_base_type(ir->type);
1852
1853 this->result = src;
1854 }
1855
1856 void
1857 vec4_visitor::visit(ir_dereference_record *ir)
1858 {
1859 unsigned int i;
1860 const glsl_type *struct_type = ir->record->type;
1861 int offset = 0;
1862
1863 ir->record->accept(this);
1864
1865 for (i = 0; i < struct_type->length; i++) {
1866 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1867 break;
1868 offset += type_size(struct_type->fields.structure[i].type);
1869 }
1870
1871 /* If the type is smaller than a vec4, replicate the last channel out. */
1872 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1873 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1874 else
1875 this->result.swizzle = BRW_SWIZZLE_NOOP;
1876 this->result.type = brw_type_for_base_type(ir->type);
1877
1878 this->result.reg_offset += offset;
1879 }
1880
1881 /**
1882 * We want to be careful in assignment setup to hit the actual storage
1883 * instead of potentially using a temporary like we might with the
1884 * ir_dereference handler.
1885 */
1886 static dst_reg
1887 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1888 {
1889 /* The LHS must be a dereference. If the LHS is a variable indexed array
1890 * access of a vector, it must be separated into a series conditional moves
1891 * before reaching this point (see ir_vec_index_to_cond_assign).
1892 */
1893 assert(ir->as_dereference());
1894 ir_dereference_array *deref_array = ir->as_dereference_array();
1895 if (deref_array) {
1896 assert(!deref_array->array->type->is_vector());
1897 }
1898
1899 /* Use the rvalue deref handler for the most part. We'll ignore
1900 * swizzles in it and write swizzles using writemask, though.
1901 */
1902 ir->accept(v);
1903 return dst_reg(v->result);
1904 }
1905
1906 void
1907 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1908 const struct glsl_type *type, uint32_t predicate)
1909 {
1910 if (type->base_type == GLSL_TYPE_STRUCT) {
1911 for (unsigned int i = 0; i < type->length; i++) {
1912 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1913 }
1914 return;
1915 }
1916
1917 if (type->is_array()) {
1918 for (unsigned int i = 0; i < type->length; i++) {
1919 emit_block_move(dst, src, type->fields.array, predicate);
1920 }
1921 return;
1922 }
1923
1924 if (type->is_matrix()) {
1925 const struct glsl_type *vec_type;
1926
1927 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1928 type->vector_elements, 1);
1929
1930 for (int i = 0; i < type->matrix_columns; i++) {
1931 emit_block_move(dst, src, vec_type, predicate);
1932 }
1933 return;
1934 }
1935
1936 assert(type->is_scalar() || type->is_vector());
1937
1938 dst->type = brw_type_for_base_type(type);
1939 src->type = dst->type;
1940
1941 dst->writemask = (1 << type->vector_elements) - 1;
1942
1943 src->swizzle = swizzle_for_size(type->vector_elements);
1944
1945 vec4_instruction *inst = emit(MOV(*dst, *src));
1946 inst->predicate = predicate;
1947
1948 dst->reg_offset++;
1949 src->reg_offset++;
1950 }
1951
1952
1953 /* If the RHS processing resulted in an instruction generating a
1954 * temporary value, and it would be easy to rewrite the instruction to
1955 * generate its result right into the LHS instead, do so. This ends
1956 * up reliably removing instructions where it can be tricky to do so
1957 * later without real UD chain information.
1958 */
1959 bool
1960 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1961 dst_reg dst,
1962 src_reg src,
1963 vec4_instruction *pre_rhs_inst,
1964 vec4_instruction *last_rhs_inst)
1965 {
1966 /* This could be supported, but it would take more smarts. */
1967 if (ir->condition)
1968 return false;
1969
1970 if (pre_rhs_inst == last_rhs_inst)
1971 return false; /* No instructions generated to work with. */
1972
1973 /* Make sure the last instruction generated our source reg. */
1974 if (src.file != GRF ||
1975 src.file != last_rhs_inst->dst.file ||
1976 src.reg != last_rhs_inst->dst.reg ||
1977 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1978 src.reladdr ||
1979 src.abs ||
1980 src.negate ||
1981 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1982 return false;
1983
1984 /* Check that the last instruction fully initialized the channels
1985 * we want to use, in the order we want to use them. We could
1986 * potentially reswizzle the operands of many instructions so that
1987 * we could handle out of order channels, but don't yet.
1988 */
1989
1990 for (unsigned i = 0; i < 4; i++) {
1991 if (dst.writemask & (1 << i)) {
1992 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1993 return false;
1994
1995 if (BRW_GET_SWZ(src.swizzle, i) != i)
1996 return false;
1997 }
1998 }
1999
2000 /* Success! Rewrite the instruction. */
2001 last_rhs_inst->dst.file = dst.file;
2002 last_rhs_inst->dst.reg = dst.reg;
2003 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2004 last_rhs_inst->dst.reladdr = dst.reladdr;
2005 last_rhs_inst->dst.writemask &= dst.writemask;
2006
2007 return true;
2008 }
2009
2010 void
2011 vec4_visitor::visit(ir_assignment *ir)
2012 {
2013 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2014 uint32_t predicate = BRW_PREDICATE_NONE;
2015
2016 if (!ir->lhs->type->is_scalar() &&
2017 !ir->lhs->type->is_vector()) {
2018 ir->rhs->accept(this);
2019 src_reg src = this->result;
2020
2021 if (ir->condition) {
2022 emit_bool_to_cond_code(ir->condition, &predicate);
2023 }
2024
2025 /* emit_block_move doesn't account for swizzles in the source register.
2026 * This should be ok, since the source register is a structure or an
2027 * array, and those can't be swizzled. But double-check to be sure.
2028 */
2029 assert(src.swizzle ==
2030 (ir->rhs->type->is_matrix()
2031 ? swizzle_for_size(ir->rhs->type->vector_elements)
2032 : BRW_SWIZZLE_NOOP));
2033
2034 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2035 return;
2036 }
2037
2038 /* Now we're down to just a scalar/vector with writemasks. */
2039 int i;
2040
2041 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2042 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2043
2044 ir->rhs->accept(this);
2045
2046 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2047
2048 src_reg src = this->result;
2049
2050 int swizzles[4];
2051 int first_enabled_chan = 0;
2052 int src_chan = 0;
2053
2054 assert(ir->lhs->type->is_vector() ||
2055 ir->lhs->type->is_scalar());
2056 dst.writemask = ir->write_mask;
2057
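/* Find the source channel feeding the first enabled writemask channel;
 * it is reused below to fill the swizzle slots that are not written.
 */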
2058 for (int i = 0; i < 4; i++) {
2059 if (dst.writemask & (1 << i)) {
2060 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2061 break;
2062 }
2063 }
2064
2065 /* Swizzle a small RHS vector into the channels being written.
2066 *
2067 * glsl ir treats write_mask as dictating how many channels are
2068 * present on the RHS, while in our instructions we need to make
2069 * those channels appear in the slots of the vec4 they're written to.
2070 */
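/* For example, a vec2 RHS written with write_mask .xz becomes swizzle
 * (x, x, y, x): RHS.x feeds dst.x, RHS.y feeds dst.z, and the unwritten
 * slots simply repeat the first enabled channel.
 */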
2071 for (int i = 0; i < 4; i++) {
2072 if (dst.writemask & (1 << i))
2073 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2074 else
2075 swizzles[i] = first_enabled_chan;
2076 }
2077 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2078 swizzles[2], swizzles[3]);
2079
2080 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2081 return;
2082 }
2083
2084 if (ir->condition) {
2085 emit_bool_to_cond_code(ir->condition, &predicate);
2086 }
2087
2088 for (i = 0; i < type_size(ir->lhs->type); i++) {
2089 vec4_instruction *inst = emit(MOV(dst, src));
2090 inst->predicate = predicate;
2091
2092 dst.reg_offset++;
2093 src.reg_offset++;
2094 }
2095 }
2096
2097 void
2098 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2099 {
2100 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2101 foreach_list(node, &ir->components) {
2102 ir_constant *field_value = (ir_constant *)node;
2103
2104 emit_constant_values(dst, field_value);
2105 }
2106 return;
2107 }
2108
2109 if (ir->type->is_array()) {
2110 for (unsigned int i = 0; i < ir->type->length; i++) {
2111 emit_constant_values(dst, ir->array_elements[i]);
2112 }
2113 return;
2114 }
2115
2116 if (ir->type->is_matrix()) {
2117 for (int i = 0; i < ir->type->matrix_columns; i++) {
2118 float *vec = &ir->value.f[i * ir->type->vector_elements];
2119
2120 for (int j = 0; j < ir->type->vector_elements; j++) {
2121 dst->writemask = 1 << j;
2122 dst->type = BRW_REGISTER_TYPE_F;
2123
2124 emit(MOV(*dst, src_reg(vec[j])));
2125 }
2126 dst->reg_offset++;
2127 }
2128 return;
2129 }
2130
2131 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2132
2133 for (int i = 0; i < ir->type->vector_elements; i++) {
2134 if (!(remaining_writemask & (1 << i)))
2135 continue;
2136
2137 dst->writemask = 1 << i;
2138 dst->type = brw_type_for_base_type(ir->type);
2139
2140 /* Find other components that match the one we're about to
2141 * write. Emits fewer instructions for things like vec4(0.5,
2142 * 1.5, 1.5, 1.5).
2143 */
2144 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2145 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2146 if (ir->value.b[i] == ir->value.b[j])
2147 dst->writemask |= (1 << j);
2148 } else {
2149 /* u, i, and f storage all line up, so no need for a
2150 * switch case for comparing each type.
2151 */
2152 if (ir->value.u[i] == ir->value.u[j])
2153 dst->writemask |= (1 << j);
2154 }
2155 }
2156
2157 switch (ir->type->base_type) {
2158 case GLSL_TYPE_FLOAT:
2159 emit(MOV(*dst, src_reg(ir->value.f[i])));
2160 break;
2161 case GLSL_TYPE_INT:
2162 emit(MOV(*dst, src_reg(ir->value.i[i])));
2163 break;
2164 case GLSL_TYPE_UINT:
2165 emit(MOV(*dst, src_reg(ir->value.u[i])));
2166 break;
2167 case GLSL_TYPE_BOOL:
2168 emit(MOV(*dst, src_reg(ir->value.b[i])));
2169 break;
2170 default:
2171 assert(!"Non-float/uint/int/bool constant");
2172 break;
2173 }
2174
2175 remaining_writemask &= ~dst->writemask;
2176 }
2177 dst->reg_offset++;
2178 }
2179
2180 void
2181 vec4_visitor::visit(ir_constant *ir)
2182 {
2183 dst_reg dst = dst_reg(this, ir->type);
2184 this->result = src_reg(dst);
2185
2186 emit_constant_values(&dst, ir);
2187 }
2188
2189 void
2190 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2191 {
2192 ir_dereference *deref = static_cast<ir_dereference *>(
2193 ir->actual_parameters.get_head());
2194 ir_variable *location = deref->variable_referenced();
2195 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2196 location->data.atomic.buffer_index);
2197
2198 /* Calculate the surface offset */
2199 src_reg offset(this, glsl_type::uint_type);
2200 ir_dereference_array *deref_array = deref->as_dereference_array();
2201 if (deref_array) {
2202 deref_array->array_index->accept(this);
2203
2204 src_reg tmp(this, glsl_type::uint_type);
2205 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2206 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2207 } else {
2208 offset = location->data.atomic.offset;
2209 }
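/* e.g. for counters[i] this yields offset = i * ATOMIC_COUNTER_SIZE plus
 * the counter's declared offset; a non-array counter just uses the
 * declared offset directly.
 */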
2210
2211 /* Emit the appropriate machine instruction */
2212 const char *callee = ir->callee->function_name();
2213 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2214
2215 if (!strcmp("__intrinsic_atomic_read", callee)) {
2216 emit_untyped_surface_read(surf_index, dst, offset);
2217
2218 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2219 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2220 src_reg(), src_reg());
2221
2222 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2223 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2224 src_reg(), src_reg());
2225 }
2226 }
2227
2228 void
2229 vec4_visitor::visit(ir_call *ir)
2230 {
2231 const char *callee = ir->callee->function_name();
2232
2233 if (!strcmp("__intrinsic_atomic_read", callee) ||
2234 !strcmp("__intrinsic_atomic_increment", callee) ||
2235 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2236 visit_atomic_counter_intrinsic(ir);
2237 } else {
2238 assert(!"Unsupported intrinsic.");
2239 }
2240 }
2241
2242 src_reg
2243 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2244 {
2245 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2246 inst->base_mrf = 2;
2247 inst->mlen = 1;
2248 inst->sampler = sampler;
2249 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2250 inst->dst.writemask = WRITEMASK_XYZW;
2251
2252 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2253 int param_base = inst->base_mrf;
2254 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2255 int zero_mask = 0xf & ~coord_mask;
2256
2257 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2258 coordinate));
2259
2260 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2261 src_reg(0)));
2262
2263 emit(inst);
2264 return src_reg(inst->dst);
2265 }
2266
2267 void
2268 vec4_visitor::visit(ir_texture *ir)
2269 {
2270 int sampler =
2271 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2272
2273 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2274 * emitting anything other than setting up the constant result.
2275 */
2276 if (ir->op == ir_tg4) {
2277 ir_constant *chan = ir->lod_info.component->as_constant();
2278 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2279 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2280 dst_reg result(this, ir->type);
2281 this->result = src_reg(result);
2282 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2283 return;
2284 }
2285 }
2286
2287 /* Should be lowered by do_lower_texture_projection */
2288 assert(!ir->projector);
2289
2290 /* Should be lowered */
2291 assert(!ir->offset || !ir->offset->type->is_array());
2292
2293 /* Generate code to compute all the subexpression trees. This has to be
2294 * done before loading any values into MRFs for the sampler message since
2295 * generating these values may involve SEND messages that need the MRFs.
2296 */
2297 src_reg coordinate;
2298 if (ir->coordinate) {
2299 ir->coordinate->accept(this);
2300 coordinate = this->result;
2301 }
2302
2303 src_reg shadow_comparitor;
2304 if (ir->shadow_comparitor) {
2305 ir->shadow_comparitor->accept(this);
2306 shadow_comparitor = this->result;
2307 }
2308
2309 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2310 src_reg offset_value;
2311 if (has_nonconstant_offset) {
2312 ir->offset->accept(this);
2313 offset_value = src_reg(this->result);
2314 }
2315
2316 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2317 src_reg lod, dPdx, dPdy, sample_index, mcs;
2318 switch (ir->op) {
2319 case ir_tex:
2320 lod = src_reg(0.0f);
2321 lod_type = glsl_type::float_type;
2322 break;
2323 case ir_txf:
2324 case ir_txl:
2325 case ir_txs:
2326 ir->lod_info.lod->accept(this);
2327 lod = this->result;
2328 lod_type = ir->lod_info.lod->type;
2329 break;
2330 case ir_query_levels:
2331 lod = src_reg(0);
2332 lod_type = glsl_type::int_type;
2333 break;
2334 case ir_txf_ms:
2335 ir->lod_info.sample_index->accept(this);
2336 sample_index = this->result;
2337 sample_index_type = ir->lod_info.sample_index->type;
2338
2339 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2340 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2341 else
2342 mcs = src_reg(0u);
2343 break;
2344 case ir_txd:
2345 ir->lod_info.grad.dPdx->accept(this);
2346 dPdx = this->result;
2347
2348 ir->lod_info.grad.dPdy->accept(this);
2349 dPdy = this->result;
2350
2351 lod_type = ir->lod_info.grad.dPdx->type;
2352 break;
2353 case ir_txb:
2354 case ir_lod:
2355 case ir_tg4:
2356 break;
2357 }
2358
2359 vec4_instruction *inst = NULL;
2360 switch (ir->op) {
2361 case ir_tex:
2362 case ir_txl:
2363 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2364 break;
2365 case ir_txd:
2366 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2367 break;
2368 case ir_txf:
2369 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2370 break;
2371 case ir_txf_ms:
2372 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2373 break;
2374 case ir_txs:
2375 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2376 break;
2377 case ir_tg4:
2378 if (has_nonconstant_offset)
2379 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2380 else
2381 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2382 break;
2383 case ir_query_levels:
2384 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2385 break;
2386 case ir_txb:
2387 assert(!"TXB is not valid for vertex shaders.");
2388 break;
2389 case ir_lod:
2390 assert(!"LOD is not valid for vertex shaders.");
2391 break;
2392 default:
2393 assert(!"Unrecognized tex op");
2394 }
2395
2396 if (ir->offset != NULL && ir->op != ir_txf)
2397 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2398
2399 /* Stuff the channel select bits in the top of the texture offset */
2400 if (ir->op == ir_tg4)
2401 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2402
2403 /* The message header is necessary for:
2404 * - Gen4 (always)
2405 * - Texel offsets
2406 * - Gather channel selection
2407 * - Sampler indices too large to fit in a 4-bit value.
2408 */
2409 inst->header_present =
2410 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2411 sampler >= 16;
2412 inst->base_mrf = 2;
2413 inst->mlen = inst->header_present + 1; /* always at least one */
2414 inst->sampler = sampler;
2415 inst->dst = dst_reg(this, ir->type);
2416 inst->dst.writemask = WRITEMASK_XYZW;
2417 inst->shadow_compare = ir->shadow_comparitor != NULL;
2418
2419 /* MRF for the first parameter */
2420 int param_base = inst->base_mrf + inst->header_present;
2421
2422 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2423 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2424 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2425 } else {
2426 /* Load the coordinate */
2427 /* FINISHME: gl_clamp_mask and saturate */
2428 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2429 int zero_mask = 0xf & ~coord_mask;
2430
2431 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2432 coordinate));
2433
2434 if (zero_mask != 0) {
2435 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2436 src_reg(0)));
2437 }
2438 /* Load the shadow comparitor */
2439 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2440 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2441 WRITEMASK_X),
2442 shadow_comparitor));
2443 inst->mlen++;
2444 }
2445
2446 /* Load the LOD info */
2447 if (ir->op == ir_tex || ir->op == ir_txl) {
2448 int mrf, writemask;
2449 if (brw->gen >= 5) {
2450 mrf = param_base + 1;
2451 if (ir->shadow_comparitor) {
2452 writemask = WRITEMASK_Y;
2453 /* mlen already incremented */
2454 } else {
2455 writemask = WRITEMASK_X;
2456 inst->mlen++;
2457 }
2458 } else /* brw->gen == 4 */ {
2459 mrf = param_base;
2460 writemask = WRITEMASK_W;
2461 }
2462 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2463 } else if (ir->op == ir_txf) {
2464 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2465 } else if (ir->op == ir_txf_ms) {
2466 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2467 sample_index));
2468 if (brw->gen >= 7) {
2469 /* MCS data is in the first channel of `mcs`, but we need to get it into
2470 * the .y channel of the second vec4 of params, so replicate .x across
2471 * the whole vec4 and then mask off everything except .y
2472 */
2473 mcs.swizzle = BRW_SWIZZLE_XXXX;
2474 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2475 mcs));
}
2476 inst->mlen++;
2477 } else if (ir->op == ir_txd) {
2478 const glsl_type *type = lod_type;
2479
2480 if (brw->gen >= 5) {
2481 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2482 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2483 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2484 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2485 inst->mlen++;
2486
2487 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2488 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2489 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2490 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2491 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2492 inst->mlen++;
2493
2494 if (ir->shadow_comparitor) {
2495 emit(MOV(dst_reg(MRF, param_base + 2,
2496 ir->shadow_comparitor->type, WRITEMASK_Z),
2497 shadow_comparitor));
2498 }
2499 }
2500 } else /* brw->gen == 4 */ {
2501 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2502 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2503 inst->mlen += 2;
2504 }
2505 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2506 if (ir->shadow_comparitor) {
2507 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2508 shadow_comparitor));
2509 }
2510
2511 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2512 offset_value));
2513 inst->mlen++;
2514 }
2515 }
2516
2517 emit(inst);
2518
2519 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2520 * spec requires layers.
2521 */
2522 if (ir->op == ir_txs) {
2523 glsl_type const *type = ir->sampler->type;
2524 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2525 type->sampler_array) {
2526 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2527 writemask(inst->dst, WRITEMASK_Z),
2528 src_reg(inst->dst), src_reg(6));
2529 }
2530 }
2531
2532 if (brw->gen == 6 && ir->op == ir_tg4) {
2533 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2534 }
2535
2536 swizzle_result(ir, src_reg(inst->dst), sampler);
2537 }
2538
2539 /**
2540 * Apply workarounds for Gen6 gather with UINT/SINT
2541 */
2542 void
2543 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2544 {
2545 if (!wa)
2546 return;
2547
2548 int width = (wa & WA_8BIT) ? 8 : 16;
2549 dst_reg dst_f = dst;
2550 dst_f.type = BRW_REGISTER_TYPE_F;
2551
2552 /* Convert from UNORM to UINT */
2553 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2554 emit(MOV(dst, src_reg(dst_f)));
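/* e.g. with an 8-bit format the MUL above scales by 255.0f, so the
 * float-to-int MOV recovers the original integer texel value.
 */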
2555
2556 if (wa & WA_SIGN) {
2557 /* Reinterpret the UINT value as a signed INT value by
2558 * shifting the sign bit into place, then shifting back
2559 * preserving sign.
2560 */
2561 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2562 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2563 }
2564 }
2565
2566 /**
2567 * Set up the gather channel based on the swizzle, for gather4.
2568 */
2569 uint32_t
2570 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2571 {
2572 ir_constant *chan = ir->lod_info.component->as_constant();
2573 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2574 switch (swiz) {
2575 case SWIZZLE_X: return 0;
2576 case SWIZZLE_Y:
2577 /* gather4 sampler is broken for green channel on RG32F --
2578 * we must ask for blue instead.
2579 */
2580 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2581 return 2;
2582 return 1;
2583 case SWIZZLE_Z: return 2;
2584 case SWIZZLE_W: return 3;
2585 default:
2586 assert(!"Not reached"); /* zero, one swizzles handled already */
2587 return 0;
2588 }
2589 }
2590
2591 void
2592 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2593 {
2594 int s = key->tex.swizzles[sampler];
2595
2596 this->result = src_reg(this, ir->type);
2597 dst_reg swizzled_result(this->result);
2598
2599 if (ir->op == ir_query_levels) {
2600 /* # levels is in .w */
2601 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2602 emit(MOV(swizzled_result, orig_val));
2603 return;
2604 }
2605
2606 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2607 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2608 emit(MOV(swizzled_result, orig_val));
2609 return;
2610 }
2611
2612
2613 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2614 int swizzle[4] = {0};
2615
2616 for (int i = 0; i < 4; i++) {
2617 switch (GET_SWZ(s, i)) {
2618 case SWIZZLE_ZERO:
2619 zero_mask |= (1 << i);
2620 break;
2621 case SWIZZLE_ONE:
2622 one_mask |= (1 << i);
2623 break;
2624 default:
2625 copy_mask |= (1 << i);
2626 swizzle[i] = GET_SWZ(s, i);
2627 break;
2628 }
2629 }
2630
2631 if (copy_mask) {
2632 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2633 swizzled_result.writemask = copy_mask;
2634 emit(MOV(swizzled_result, orig_val));
2635 }
2636
2637 if (zero_mask) {
2638 swizzled_result.writemask = zero_mask;
2639 emit(MOV(swizzled_result, src_reg(0.0f)));
2640 }
2641
2642 if (one_mask) {
2643 swizzled_result.writemask = one_mask;
2644 emit(MOV(swizzled_result, src_reg(1.0f)));
2645 }
2646 }
2647
2648 void
2649 vec4_visitor::visit(ir_return *ir)
2650 {
2651 assert(!"not reached");
2652 }
2653
2654 void
2655 vec4_visitor::visit(ir_discard *ir)
2656 {
2657 assert(!"not reached");
2658 }
2659
2660 void
2661 vec4_visitor::visit(ir_if *ir)
2662 {
2663 /* Don't point the annotation at the if statement, because then it plus
2664 * the then and else blocks get printed.
2665 */
2666 this->base_ir = ir->condition;
2667
2668 if (brw->gen == 6) {
2669 emit_if_gen6(ir);
2670 } else {
2671 uint32_t predicate;
2672 emit_bool_to_cond_code(ir->condition, &predicate);
2673 emit(IF(predicate));
2674 }
2675
2676 visit_instructions(&ir->then_instructions);
2677
2678 if (!ir->else_instructions.is_empty()) {
2679 this->base_ir = ir->condition;
2680 emit(BRW_OPCODE_ELSE);
2681
2682 visit_instructions(&ir->else_instructions);
2683 }
2684
2685 this->base_ir = ir->condition;
2686 emit(BRW_OPCODE_ENDIF);
2687 }
2688
2689 void
2690 vec4_visitor::visit(ir_emit_vertex *)
2691 {
2692 assert(!"not reached");
2693 }
2694
2695 void
2696 vec4_visitor::visit(ir_end_primitive *)
2697 {
2698 assert(!"not reached");
2699 }
2700
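/**
 * Emit an untyped atomic message. The offset and the (up to two) operands
 * each go in the .x channel of their own MRF, starting at MRF 0.
 */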
2701 void
2702 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2703 dst_reg dst, src_reg offset,
2704 src_reg src0, src_reg src1)
2705 {
2706 unsigned mlen = 0;
2707
2708 /* Set the atomic operation offset. */
2709 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2710 mlen++;
2711
2712 /* Set the atomic operation arguments. */
2713 if (src0.file != BAD_FILE) {
2714 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2715 mlen++;
2716 }
2717
2718 if (src1.file != BAD_FILE) {
2719 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2720 mlen++;
2721 }
2722
2723 /* Emit the instruction. Note that this maps to the normal SIMD8
2724 * untyped atomic message on Ivy Bridge, but that's OK because
2725 * unused channels will be masked out.
2726 */
2727 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2728 src_reg(atomic_op), src_reg(surf_index));
2729 inst->base_mrf = 0;
2730 inst->mlen = mlen;
2731 }
2732
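/**
 * Emit an untyped surface read of a single value: the offset goes in the
 * .x channel of MRF 0 and the result is written to @dst.
 */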
2733 void
2734 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2735 src_reg offset)
2736 {
2737 /* Set the surface read offset. */
2738 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2739
2740 /* Emit the instruction. Note that this maps to the normal SIMD8
2741 * untyped surface read message, but that's OK because unused
2742 * channels will be masked out.
2743 */
2744 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2745 dst, src_reg(surf_index));
2746 inst->base_mrf = 0;
2747 inst->mlen = 1;
2748 }
2749
2750 void
2751 vec4_visitor::emit_ndc_computation()
2752 {
2753 /* Get the position */
2754 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2755
2756 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2757 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2758 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2759
2760 current_annotation = "NDC";
2761 dst_reg ndc_w = ndc;
2762 ndc_w.writemask = WRITEMASK_W;
2763 src_reg pos_w = pos;
2764 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2765 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2766
2767 dst_reg ndc_xyz = ndc;
2768 ndc_xyz.writemask = WRITEMASK_XYZ;
2769
2770 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2771 }
2772
2773 void
2774 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2775 {
2776 if (brw->gen < 6 &&
2777 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2778 key->userclip_active || brw->has_negative_rhw_bug)) {
2779 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2780 dst_reg header1_w = header1;
2781 header1_w.writemask = WRITEMASK_W;
2782
2783 emit(MOV(header1, 0u));
2784
2785 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2786 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2787
2788 current_annotation = "Point size";
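/* Scale the float point size by 2^11 and keep an 11-bit field starting
 * at bit 8 of the header dword.
 */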
2789 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2790 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2791 }
2792
2793 if (key->userclip_active) {
2794 current_annotation = "Clipping flags";
2795 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2796 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2797
2798 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2799 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2800 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2801
2802 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2803 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2804 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2805 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2806 }
2807
2808 /* i965 clipping workaround:
2809 * 1) Test for -ve rhw
2810 * 2) If set,
2811 * set ndc = (0,0,0,0)
2812 * set ucp[6] = 1
2813 *
2814 * Later, clipping will detect ucp[6] and ensure the primitive is
2815 * clipped against all fixed planes.
2816 */
2817 if (brw->has_negative_rhw_bug) {
2818 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2819 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2820 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2821 vec4_instruction *inst;
2822 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2823 inst->predicate = BRW_PREDICATE_NORMAL;
2824 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2825 inst->predicate = BRW_PREDICATE_NORMAL;
2826 }
2827
2828 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2829 } else if (brw->gen < 6) {
2830 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2831 } else {
2832 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2833 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2834 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2835 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2836 }
2837 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2838 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2839 src_reg(output_reg[VARYING_SLOT_LAYER])));
2840 }
2841 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2842 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2843 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2844 }
2845 }
2846 }
2847
2848 void
2849 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2850 {
2851 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2852 *
2853 * "If a linked set of shaders forming the vertex stage contains no
2854 * static write to gl_ClipVertex or gl_ClipDistance, but the
2855 * application has requested clipping against user clip planes through
2856 * the API, then the coordinate written to gl_Position is used for
2857 * comparison against the user clip planes."
2858 *
2859 * This function is only called if the shader didn't write to
2860 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2861 * if the user wrote to it; otherwise we use gl_Position.
2862 */
2863 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2864 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2865 clip_vertex = VARYING_SLOT_POS;
2866 }
2867
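/* Each iteration computes one clip distance: channel i of @reg gets the
 * dot product of the clip vertex with user plane (i + offset).
 */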
2868 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2869 ++i) {
2870 reg.writemask = 1 << i;
2871 emit(DP4(reg,
2872 src_reg(output_reg[clip_vertex]),
2873 src_reg(this->userplane[i + offset])));
2874 }
2875 }
2876
2877 void
2878 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2879 {
2880 assert (varying < VARYING_SLOT_MAX);
2881 reg.type = output_reg[varying].type;
2882 current_annotation = output_reg_annotation[varying];
2883 /* Copy the register, saturating if necessary */
2884 vec4_instruction *inst = emit(MOV(reg,
2885 src_reg(output_reg[varying])));
2886 if ((varying == VARYING_SLOT_COL0 ||
2887 varying == VARYING_SLOT_COL1 ||
2888 varying == VARYING_SLOT_BFC0 ||
2889 varying == VARYING_SLOT_BFC1) &&
2890 key->clamp_vertex_color) {
2891 inst->saturate = true;
2892 }
2893 }
2894
2895 void
2896 vec4_visitor::emit_urb_slot(int mrf, int varying)
2897 {
2898 struct brw_reg hw_reg = brw_message_reg(mrf);
2899 dst_reg reg = dst_reg(MRF, mrf);
2900 reg.type = BRW_REGISTER_TYPE_F;
2901
2902 switch (varying) {
2903 case VARYING_SLOT_PSIZ:
2904 /* PSIZ is always in slot 0, and is coupled with other flags. */
2905 current_annotation = "indices, point width, clip flags";
2906 emit_psiz_and_flags(hw_reg);
2907 break;
2908 case BRW_VARYING_SLOT_NDC:
2909 current_annotation = "NDC";
2910 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2911 break;
2912 case VARYING_SLOT_POS:
2913 current_annotation = "gl_Position";
2914 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2915 break;
2916 case VARYING_SLOT_EDGE:
2917 /* This is present when doing unfilled polygons. We're supposed to copy
2918 * the edge flag from the user-provided vertex array
2919 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2920 * of that attribute (starts as 1.0f). This is then used in clipping to
2921 * determine which edges should be drawn as wireframe.
2922 */
2923 current_annotation = "edge flag";
2924 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2925 glsl_type::float_type, WRITEMASK_XYZW))));
2926 break;
2927 case BRW_VARYING_SLOT_PAD:
2928 /* No need to write to this slot */
2929 break;
2930 default:
2931 emit_generic_urb_slot(reg, varying);
2932 break;
2933 }
2934 }
2935
2936 static int
2937 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2938 {
2939 if (brw->gen >= 6) {
2940 /* URB data written (does not include the message header reg) must
2941 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2942 * section 5.4.3.2.2: URB_INTERLEAVED.
2943 *
2944 * URB entries are allocated on a multiple of 1024 bits, so an
2945 * extra 128 bits written here to make the end align to 256 is
2946 * no problem.
2947 */
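/* mlen here still includes the one header register, so forcing mlen odd
 * keeps the data portion (mlen - 1) an even number of registers.
 */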
2948 if ((mlen % 2) != 1)
2949 mlen++;
2950 }
2951
2952 return mlen;
2953 }
2954
2955
2956 /**
2957 * Generates the VUE payload plus the necessary URB write instructions to
2958 * output it.
2959 *
2960 * The VUE layout is documented in Volume 2a.
2961 */
2962 void
2963 vec4_visitor::emit_vertex()
2964 {
2965 /* MRF 0 is reserved for the debugger, so start with message header
2966 * in MRF 1.
2967 */
2968 int base_mrf = 1;
2969 int mrf = base_mrf;
2970 /* In the process of generating our URB write message contents, we
2971 * may need to unspill a register or load from an array. Those
2972 * reads would use MRFs 14-15.
2973 */
2974 int max_usable_mrf = 13;
2975
2976 /* The following assertion verifies that max_usable_mrf causes an
2977 * even-numbered amount of URB write data, which will meet gen6's
2978 * requirements for length alignment.
2979 */
2980 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2981
2982 /* First mrf is the g0-based message header containing URB handles and
2983 * such.
2984 */
2985 emit_urb_write_header(mrf++);
2986
2987 if (brw->gen < 6) {
2988 emit_ndc_computation();
2989 }
2990
2991 /* Lower legacy ff and ClipVertex clipping to clip distances */
2992 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2993 current_annotation = "user clip distances";
2994
2995 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2996 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2997
2998 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2999 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3000 }
3001
3002 /* We may need to split this up into several URB writes, so do them in a
3003 * loop.
3004 */
3005 int slot = 0;
3006 bool complete = false;
3007 do {
3008 /* URB offset is in URB row increments, and each of our MRFs is half of
3009 * one of those, since we're doing interleaved writes.
3010 */
3011 int offset = slot / 2;
3012
3013 mrf = base_mrf + 1;
3014 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3015 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3016
3017 /* If this was max_usable_mrf, we can't fit anything more into this
3018 * URB WRITE.
3019 */
3020 if (mrf > max_usable_mrf) {
3021 slot++;
3022 break;
3023 }
3024 }
3025
3026 complete = slot >= prog_data->vue_map.num_slots;
3027 current_annotation = "URB write";
3028 vec4_instruction *inst = emit_urb_write_opcode(complete);
3029 inst->base_mrf = base_mrf;
3030 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3031 inst->offset += offset;
3032 } while(!complete);
3033 }
3034
3035
3036 src_reg
3037 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3038 src_reg *reladdr, int reg_offset)
3039 {
3040 /* Because we store the values to scratch interleaved like our
3041 * vertex data, we need to scale the vec4 index by 2.
3042 */
3043 int message_header_scale = 2;
3044
3045 /* Pre-gen6, the message header uses byte offsets instead of vec4
3046 * (16-byte) offset units.
3047 */
3048 if (brw->gen < 6)
3049 message_header_scale *= 16;
3050
3051 if (reladdr) {
3052 src_reg index = src_reg(this, glsl_type::int_type);
3053
3054 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3055 emit_before(inst, MUL(dst_reg(index),
3056 index, src_reg(message_header_scale)));
3057
3058 return index;
3059 } else {
3060 return src_reg(reg_offset * message_header_scale);
3061 }
3062 }
3063
3064 src_reg
3065 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3066 src_reg *reladdr, int reg_offset)
3067 {
3068 if (reladdr) {
3069 src_reg index = src_reg(this, glsl_type::int_type);
3070
3071 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3072
3073 /* Pre-gen6, the message header uses byte offsets instead of vec4
3074 * (16-byte) offset units.
3075 */
3076 if (brw->gen < 6) {
3077 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3078 }
3079
3080 return index;
3081 } else if (brw->gen >= 8) {
3082 /* Store the offset in a GRF so we can send-from-GRF. */
3083 src_reg offset = src_reg(this, glsl_type::int_type);
3084 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3085 return offset;
3086 } else {
3087 int message_header_scale = brw->gen < 6 ? 16 : 1;
3088 return src_reg(reg_offset * message_header_scale);
3089 }
3090 }
3091
3092 /**
3093 * Emits an instruction before @inst to load the value named by @orig_src
3094 * from scratch space at @base_offset to @temp.
3095 *
3096 * @base_offset is measured in 32-byte units (the size of a register).
3097 */
3098 void
3099 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3100 dst_reg temp, src_reg orig_src,
3101 int base_offset)
3102 {
3103 int reg_offset = base_offset + orig_src.reg_offset;
3104 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3105
3106 emit_before(inst, SCRATCH_READ(temp, index));
3107 }
3108
3109 /**
3110 * Emits an instruction after @inst to store the value to be written
3111 * to @orig_dst to scratch space at @base_offset, from @temp.
3112 *
3113 * @base_offset is measured in 32-byte units (the size of a register).
3114 */
3115 void
3116 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3117 {
3118 int reg_offset = base_offset + inst->dst.reg_offset;
3119 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3120
3121 /* Create a temporary register to store *inst's result in.
3122 *
3123 * We have to be careful in MOVing from our temporary result register in
3124 * the scratch write. If we swizzle from channels of the temporary that
3125 * weren't initialized, it will confuse live interval analysis, which will
3126 * make spilling fail to make progress.
3127 */
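/* e.g. a destination writemask of .xz produces swizzle (x, x, z, x), so
 * every channel read by the scratch write was actually written by *inst.
 */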
3128 src_reg temp = src_reg(this, glsl_type::vec4_type);
3129 temp.type = inst->dst.type;
3130 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3131 int swizzles[4];
3132 for (int i = 0; i < 4; i++)
3133 if (inst->dst.writemask & (1 << i))
3134 swizzles[i] = i;
3135 else
3136 swizzles[i] = first_writemask_chan;
3137 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3138 swizzles[2], swizzles[3]);
3139
3140 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3141 inst->dst.writemask));
3142 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3143 write->predicate = inst->predicate;
3144 write->ir = inst->ir;
3145 write->annotation = inst->annotation;
3146 inst->insert_after(write);
3147
3148 inst->dst.file = temp.file;
3149 inst->dst.reg = temp.reg;
3150 inst->dst.reg_offset = temp.reg_offset;
3151 inst->dst.reladdr = NULL;
3152 }
3153
3154 /**
3155 * We can't generally support array access in GRF space, because a
3156 * single instruction's destination can only span 2 contiguous
3157 * registers. So, we send all GRF arrays that get variable index
3158 * access to scratch space.
3159 */
3160 void
3161 vec4_visitor::move_grf_array_access_to_scratch()
3162 {
3163 int scratch_loc[this->virtual_grf_count];
3164
3165 for (int i = 0; i < this->virtual_grf_count; i++) {
3166 scratch_loc[i] = -1;
3167 }
3168
3169 /* First, calculate the set of virtual GRFs that need to be punted
3170 * to scratch due to having any array access on them, and where in
3171 * scratch.
3172 */
3173 foreach_list(node, &this->instructions) {
3174 vec4_instruction *inst = (vec4_instruction *)node;
3175
3176 if (inst->dst.file == GRF && inst->dst.reladdr &&
3177 scratch_loc[inst->dst.reg] == -1) {
3178 scratch_loc[inst->dst.reg] = c->last_scratch;
3179 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3180 }
3181
3182 for (int i = 0 ; i < 3; i++) {
3183 src_reg *src = &inst->src[i];
3184
3185 if (src->file == GRF && src->reladdr &&
3186 scratch_loc[src->reg] == -1) {
3187 scratch_loc[src->reg] = c->last_scratch;
3188 c->last_scratch += this->virtual_grf_sizes[src->reg];
3189 }
3190 }
3191 }
3192
3193 /* Now, for anything that will be accessed through scratch, rewrite
3194 * it to load/store. Note that this is a _safe list walk, because
3195 * we may generate a new scratch_write instruction after the one
3196 * we're processing.
3197 */
3198 foreach_list_safe(node, &this->instructions) {
3199 vec4_instruction *inst = (vec4_instruction *)node;
3200
3201 /* Set up the annotation tracking for new generated instructions. */
3202 base_ir = inst->ir;
3203 current_annotation = inst->annotation;
3204
3205 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3206 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3207 }
3208
3209 for (int i = 0 ; i < 3; i++) {
3210 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3211 continue;
3212
3213 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3214
3215 emit_scratch_read(inst, temp, inst->src[i],
3216 scratch_loc[inst->src[i].reg]);
3217
3218 inst->src[i].file = temp.file;
3219 inst->src[i].reg = temp.reg;
3220 inst->src[i].reg_offset = temp.reg_offset;
3221 inst->src[i].reladdr = NULL;
3222 }
3223 }
3224 }
3225
3226 /**
3227 * Emits an instruction before @inst to load the value named by @orig_src
3228 * from the pull constant buffer (surface) at @base_offset to @temp.
3229 */
3230 void
3231 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3232 dst_reg temp, src_reg orig_src,
3233 int base_offset)
3234 {
3235 int reg_offset = base_offset + orig_src.reg_offset;
3236 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3237 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3238 vec4_instruction *load;
3239
3240 if (brw->gen >= 7) {
3241 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3242 grf_offset.type = offset.type;
3243 emit_before(inst, MOV(grf_offset, offset));
3244
3245 load = new(mem_ctx) vec4_instruction(this,
3246 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3247 temp, index, src_reg(grf_offset));
3248 } else {
3249 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3250 temp, index, offset);
3251 load->base_mrf = 14;
3252 load->mlen = 1;
3253 }
3254 emit_before(inst, load);
3255 }
3256
3257 /**
3258 * Implements array access of uniforms by inserting a
3259 * PULL_CONSTANT_LOAD instruction.
3260 *
3261 * Unlike temporary GRF array access (where we don't support it due to
3262 * the difficulty of doing relative addressing on instruction
3263 * destinations), we could potentially do array access of uniforms
3264 * that were loaded in GRF space as push constants. In real-world
3265 * usage we've seen, though, the arrays being used are always larger
3266 * than we could load as push constants, so just always move all
3267 * uniform array access out to a pull constant buffer.
3268 */
3269 void
3270 vec4_visitor::move_uniform_array_access_to_pull_constants()
3271 {
3272 int pull_constant_loc[this->uniforms];
3273
3274 for (int i = 0; i < this->uniforms; i++) {
3275 pull_constant_loc[i] = -1;
3276 }
3277
3278 /* Walk through and find array access of uniforms. Put a copy of that
3279 * uniform in the pull constant buffer.
3280 *
3281 * Note that we don't move constant-indexed accesses to arrays. No
3282 * testing has been done of the performance impact of this choice.
3283 */
3284 foreach_list_safe(node, &this->instructions) {
3285 vec4_instruction *inst = (vec4_instruction *)node;
3286
3287 for (int i = 0 ; i < 3; i++) {
3288 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3289 continue;
3290
3291 int uniform = inst->src[i].reg;
3292
3293 /* If this array isn't already present in the pull constant buffer,
3294 * add it.
3295 */
3296 if (pull_constant_loc[uniform] == -1) {
3297 const float **values = &stage_prog_data->param[uniform * 4];
3298
3299 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3300
3301 assert(uniform < uniform_array_size);
3302 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3303 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3304 = values[j];
3305 }
3306 }
3307
3308 /* Set up the annotation tracking for new generated instructions. */
3309 base_ir = inst->ir;
3310 current_annotation = inst->annotation;
3311
3312 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3313
3314 emit_pull_constant_load(inst, temp, inst->src[i],
3315 pull_constant_loc[uniform]);
3316
3317 inst->src[i].file = temp.file;
3318 inst->src[i].reg = temp.reg;
3319 inst->src[i].reg_offset = temp.reg_offset;
3320 inst->src[i].reladdr = NULL;
3321 }
3322 }
3323
3324 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3325 * no need to track them as larger-than-vec4 objects. This will be
3326 * relied on in cutting out unused uniform vectors from push
3327 * constants.
3328 */
3329 split_uniform_registers();
3330 }
3331
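/**
 * If a UD-typed source has its negate flag set, copy it through a MOV into
 * a temporary so the consuming instruction sees a plain unsigned source
 * with no modifier.
 */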
3332 void
3333 vec4_visitor::resolve_ud_negate(src_reg *reg)
3334 {
3335 if (reg->type != BRW_REGISTER_TYPE_UD ||
3336 !reg->negate)
3337 return;
3338
3339 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3340 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3341 *reg = temp;
3342 }
3343
3344 vec4_visitor::vec4_visitor(struct brw_context *brw,
3345 struct brw_vec4_compile *c,
3346 struct gl_program *prog,
3347 const struct brw_vec4_prog_key *key,
3348 struct brw_vec4_prog_data *prog_data,
3349 struct gl_shader_program *shader_prog,
3350 gl_shader_stage stage,
3351 void *mem_ctx,
3352 bool debug_flag,
3353 bool no_spills,
3354 shader_time_shader_type st_base,
3355 shader_time_shader_type st_written,
3356 shader_time_shader_type st_reset)
3357 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3358 c(c),
3359 key(key),
3360 prog_data(prog_data),
3361 sanity_param_count(0),
3362 fail_msg(NULL),
3363 first_non_payload_grf(0),
3364 need_all_constants_in_pull_buffer(false),
3365 debug_flag(debug_flag),
3366 no_spills(no_spills),
3367 st_base(st_base),
3368 st_written(st_written),
3369 st_reset(st_reset)
3370 {
3371 this->mem_ctx = mem_ctx;
3372 this->failed = false;
3373
3374 this->base_ir = NULL;
3375 this->current_annotation = NULL;
3376 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3377
3378 this->variable_ht = hash_table_ctor(0,
3379 hash_table_pointer_hash,
3380 hash_table_pointer_compare);
3381
3382 this->virtual_grf_start = NULL;
3383 this->virtual_grf_end = NULL;
3384 this->virtual_grf_sizes = NULL;
3385 this->virtual_grf_count = 0;
3386 this->virtual_grf_reg_map = NULL;
3387 this->virtual_grf_reg_count = 0;
3388 this->virtual_grf_array_size = 0;
3389 this->live_intervals_valid = false;
3390
3391 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3392
3393 this->uniforms = 0;
3394
3395 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3396 * at least one. See setup_uniforms() in brw_vec4.cpp.
3397 */
3398 this->uniform_array_size = 1;
3399 if (prog_data) {
3400 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3401 }
3402
3403 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3404 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3405 }
3406
3407 vec4_visitor::~vec4_visitor()
3408 {
3409 hash_table_dtor(this->variable_ht);
3410 }
3411
3412
3413 void
3414 vec4_visitor::fail(const char *format, ...)
3415 {
3416 va_list va;
3417 char *msg;
3418
3419 if (failed)
3420 return;
3421
3422 failed = true;
3423
3424 va_start(va, format);
3425 msg = ralloc_vasprintf(mem_ctx, format, va);
3426 va_end(va);
3427 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3428
3429 this->fail_msg = msg;
3430
3431 if (debug_flag) {
3432 fprintf(stderr, "%s", msg);
3433 }
3434 }
3435
3436 } /* namespace brw */