i965/vec4: Add a brw->gen >= 6 assertion in three-source emitters.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
56 this->annotation = v->current_annotation;
57 }
58
59 vec4_instruction *
60 vec4_visitor::emit(vec4_instruction *inst)
61 {
62 this->instructions.push_tail(inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
80 src_reg src0, src_reg src1, src_reg src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
91 }
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
95 {
96 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
101 {
102 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
109 }
110
111 #define ALU1(op) \
112 vec4_instruction * \
113 vec4_visitor::op(dst_reg dst, src_reg src0) \
114 { \
115 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
116 src0); \
117 }
118
119 #define ALU2(op) \
120 vec4_instruction * \
121 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
122 { \
123 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
124 src0, src1); \
125 }
126
127 #define ALU3(op) \
128 vec4_instruction * \
129 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
130 { \
131 assert(brw->gen >= 6); \
132 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
133 src0, src1, src2); \
134 }
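/* Illustrative only (not part of the upstream file): with the new assertion
 * in place, ALU3(MAD) expands to roughly
 *
 *    vec4_instruction *
 *    vec4_visitor::MAD(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)
 *    {
 *       assert(brw->gen >= 6);
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_MAD, dst,
 *                                            src0, src1, src2);
 *    }
 *
 * Three-source instructions were introduced in gen6, so the assertion catches
 * any attempt to emit MAD, LRP, BFE or BFI2 on older hardware.
 */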
135
136 ALU1(NOT)
137 ALU1(MOV)
138 ALU1(FRC)
139 ALU1(RNDD)
140 ALU1(RNDE)
141 ALU1(RNDZ)
142 ALU1(F32TO16)
143 ALU1(F16TO32)
144 ALU2(ADD)
145 ALU2(MUL)
146 ALU2(MACH)
147 ALU2(AND)
148 ALU2(OR)
149 ALU2(XOR)
150 ALU2(DP3)
151 ALU2(DP4)
152 ALU2(DPH)
153 ALU2(SHL)
154 ALU2(SHR)
155 ALU2(ASR)
156 ALU3(LRP)
157 ALU1(BFREV)
158 ALU3(BFE)
159 ALU2(BFI1)
160 ALU3(BFI2)
161 ALU1(FBH)
162 ALU1(FBL)
163 ALU1(CBIT)
164 ALU3(MAD)
165 ALU2(ADDC)
166 ALU2(SUBB)
167
168 /** Gen4 predicated IF. */
169 vec4_instruction *
170 vec4_visitor::IF(uint32_t predicate)
171 {
172 vec4_instruction *inst;
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
175 inst->predicate = predicate;
176
177 return inst;
178 }
179
180 /** Gen6 IF with embedded comparison. */
181 vec4_instruction *
182 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
183 {
184 assert(brw->gen == 6);
185
186 vec4_instruction *inst;
187
188 resolve_ud_negate(&src0);
189 resolve_ud_negate(&src1);
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
192 src0, src1);
193 inst->conditional_mod = condition;
194
195 return inst;
196 }
197
198 /**
199 * CMP: Sets the low bit of the destination channels with the result
200 * of the comparison, while the upper bits are undefined, and updates
201 * the flag register with the packed 16 bits of the result.
202 */
203 vec4_instruction *
204 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
205 {
206 vec4_instruction *inst;
207
208 /* The original gen4 does type conversion to the destination type
209 * before comparison, producing garbage results for floating
210 * point comparisons.
211 */
212 if (brw->gen == 4) {
213 dst.type = src0.type;
214 if (dst.file == HW_REG)
215 dst.fixed_hw_reg.type = dst.type;
216 }
217
218 resolve_ud_negate(&src0);
219 resolve_ud_negate(&src1);
220
221 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
222 inst->conditional_mod = condition;
223
224 return inst;
225 }
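/* Illustrative sketch of the usual follow-up elsewhere in this file: because
 * only the low bit of each channel is defined, boolean results produced with
 * CMP are normally masked afterwards, e.g.
 *
 *    emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
 *    emit(AND(result_dst, result_src, src_reg(0x1)));
 */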
226
227 vec4_instruction *
228 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
229 {
230 vec4_instruction *inst;
231
232 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
233 dst, index);
234 inst->base_mrf = 14;
235 inst->mlen = 2;
236
237 return inst;
238 }
239
240 vec4_instruction *
241 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
242 {
243 vec4_instruction *inst;
244
245 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
246 dst, src, index);
247 inst->base_mrf = 13;
248 inst->mlen = 3;
249
250 return inst;
251 }
252
253 void
254 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
255 {
256 static enum opcode dot_opcodes[] = {
257 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
258 };
259
260 emit(dot_opcodes[elements - 2], dst, src0, src1);
261 }
262
263 src_reg
264 vec4_visitor::fix_3src_operand(src_reg src)
265 {
266 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
267 * able to use vertical stride of zero to replicate the vec4 uniform, like
268 *
269 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
270 *
271 * But you can't, since vertical stride is always four in three-source
272 * instructions. Instead, insert a MOV instruction to do the replication so
273 * that the three-source instruction can consume it.
274 */
275
276 /* The MOV is only needed if the source is a uniform or immediate. */
277 if (src.file != UNIFORM && src.file != IMM)
278 return src;
279
280 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
281 expanded.type = src.type;
282 emit(MOV(expanded, src));
283 return src_reg(expanded);
284 }
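/* Usage sketch (illustrative; "u" is a hypothetical UNIFORM operand):
 *
 *    src_reg a = fix_3src_operand(u);   // emits MOV tmp, u and returns tmp
 *    emit(MAD(dst, a, src1, src2));     // tmp is a plain GRF, so the fixed
 *                                       // vertical stride is no problem
 *
 * GRF sources are returned unchanged, so the extra MOV is only paid for
 * UNIFORM and IMM operands.
 */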
285
286 src_reg
287 vec4_visitor::fix_math_operand(src_reg src)
288 {
289 /* The gen6 math instruction ignores the source modifiers --
290 * swizzle, abs, negate, and at least some parts of the register
291 * region description.
292 *
293 * Rather than trying to enumerate all these cases, *always* expand the
294 * operand to a temp GRF for gen6.
295 *
296 * For gen7, keep the operand as-is, except if immediate, which gen7 still
297 * can't use.
298 */
299
300 if (brw->gen == 7 && src.file != IMM)
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
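/* Illustrative consequence of the rules above: on gen6 even a simple negated
 * operand such as -x would have its negate ignored by the math instruction,
 * so it is first materialized with a MOV into a fresh GRF; on gen7 only
 * immediates (e.g. src_reg(2.0f)) need that copy.
 */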
308
309 void
310 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
311 {
312 src = fix_math_operand(src);
313
314 if (dst.writemask != WRITEMASK_XYZW) {
315 /* The gen6 math instruction must be align1, so we can't do
316 * writemasks.
317 */
318 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
319
320 emit(opcode, temp_dst, src);
321
322 emit(MOV(dst, src_reg(temp_dst)));
323 } else {
324 emit(opcode, dst, src);
325 }
326 }
327
328 void
329 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
330 {
331 vec4_instruction *inst = emit(opcode, dst, src);
332 inst->base_mrf = 1;
333 inst->mlen = 1;
334 }
335
336 void
337 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
338 {
339 switch (opcode) {
340 case SHADER_OPCODE_RCP:
341 case SHADER_OPCODE_RSQ:
342 case SHADER_OPCODE_SQRT:
343 case SHADER_OPCODE_EXP2:
344 case SHADER_OPCODE_LOG2:
345 case SHADER_OPCODE_SIN:
346 case SHADER_OPCODE_COS:
347 break;
348 default:
349 assert(!"not reached: bad math opcode");
350 return;
351 }
352
353 if (brw->gen >= 6) {
354 return emit_math1_gen6(opcode, dst, src);
355 } else {
356 return emit_math1_gen4(opcode, dst, src);
357 }
358 }
359
360 void
361 vec4_visitor::emit_math2_gen6(enum opcode opcode,
362 dst_reg dst, src_reg src0, src_reg src1)
363 {
364 src0 = fix_math_operand(src0);
365 src1 = fix_math_operand(src1);
366
367 if (dst.writemask != WRITEMASK_XYZW) {
368 /* The gen6 math instruction must be align1, so we can't do
369 * writemasks.
370 */
371 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
372 temp_dst.type = dst.type;
373
374 emit(opcode, temp_dst, src0, src1);
375
376 emit(MOV(dst, src_reg(temp_dst)));
377 } else {
378 emit(opcode, dst, src0, src1);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen4(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 vec4_instruction *inst = emit(opcode, dst, src0, src1);
387 inst->base_mrf = 1;
388 inst->mlen = 2;
389 }
390
391 void
392 vec4_visitor::emit_math(enum opcode opcode,
393 dst_reg dst, src_reg src0, src_reg src1)
394 {
395 switch (opcode) {
396 case SHADER_OPCODE_POW:
397 case SHADER_OPCODE_INT_QUOTIENT:
398 case SHADER_OPCODE_INT_REMAINDER:
399 break;
400 default:
401 assert(!"not reached: unsupported binary math opcode");
402 return;
403 }
404
405 if (brw->gen >= 6) {
406 return emit_math2_gen6(opcode, dst, src0, src1);
407 } else {
408 return emit_math2_gen4(opcode, dst, src0, src1);
409 }
410 }
411
412 void
413 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
414 {
415 if (brw->gen < 7)
416 assert(!"ir_unop_pack_half_2x16 should be lowered");
417
418 assert(dst.type == BRW_REGISTER_TYPE_UD);
419 assert(src0.type == BRW_REGISTER_TYPE_F);
420
421 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
422 *
423 * Because this instruction does not have a 16-bit floating-point type,
424 * the destination data type must be Word (W).
425 *
426 * The destination must be DWord-aligned and specify a horizontal stride
427 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
428 * each destination channel and the upper word is not modified.
429 *
430 * The above restriction implies that the f32to16 instruction must use
431 * align1 mode, because only in align1 mode is it possible to specify
432 * horizontal stride. We choose here to defy the hardware docs and emit
433 * align16 instructions.
434 *
435 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
436 * instructions. I was partially successful in that the code passed all
437 * tests. However, the code was dubiously correct and fragile, and the
438 * tests were not harsh enough to probe that frailty. Not trusting the
439 * code, I chose instead to remain in align16 mode in defiance of the hw
440 * docs).
441 *
442 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
443 * simulator, emitting a f32to16 in align16 mode with UD as destination
444 * data type is safe. The behavior differs from that specified in the PRM
445 * in that the upper word of each destination channel is cleared to 0.
446 */
447
448 dst_reg tmp_dst(this, glsl_type::uvec2_type);
449 src_reg tmp_src(tmp_dst);
450
451 #if 0
452 /* Verify the undocumented behavior on which the following instructions
453 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
454 * then the result of the bit-or instruction below will be incorrect.
455 *
456 * You should inspect the disasm output in order to verify that the MOV is
457 * not optimized away.
458 */
459 emit(MOV(tmp_dst, src_reg(0x12345678u)));
460 #endif
461
462 /* Give tmp the form below, where "." means untouched.
463 *
464 * w z y x w z y x
465 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
466 *
467 * The upper word of each write-channel must be 0 for the
468 * following bit-shift and bit-or instructions to work. Note that this
469 * relies on the undocumented hardware behavior mentioned above.
470 */
471 tmp_dst.writemask = WRITEMASK_XY;
472 emit(F32TO16(tmp_dst, src0));
473
474 /* Give the write-channels of dst the form:
475 * 0xhhhh0000
476 */
477 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
478 emit(SHL(dst, tmp_src, src_reg(16u)));
479
480 /* Finally, give the write-channels of dst the form of packHalf2x16's
481 * output:
482 * 0xhhhhllll
483 */
484 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
485 emit(OR(dst, src_reg(dst), tmp_src));
486 }
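/* Worked example (illustrative): packHalf2x16(vec2(1.0, 2.0)).
 * F32TO16 writes 0x3C00 (1.0h) into tmp.x and 0x4000 (2.0h) into tmp.y, each
 * with a zero upper word. The SHL of the Y swizzle gives 0x40000000, and the
 * final OR with the X swizzle produces 0x40003C00, matching the GLSL rule
 * that the first component lands in the least significant bits.
 */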
487
488 void
489 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
490 {
491 if (brw->gen < 7)
492 assert(!"ir_unop_unpack_half_2x16 should be lowered");
493
494 assert(dst.type == BRW_REGISTER_TYPE_F);
495 assert(src0.type == BRW_REGISTER_TYPE_UD);
496
497 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
498 *
499 * Because this instruction does not have a 16-bit floating-point type,
500 * the source data type must be Word (W). The destination type must be
501 * F (Float).
502 *
503 * To use W as the source data type, we must adjust horizontal strides,
504 * which is only possible in align1 mode. All my [chadv] attempts at
505 * emitting align1 instructions for unpackHalf2x16 failed to pass the
506 * Piglit tests, so I gave up.
507 *
508 * I've verified that, on gen7 hardware and the simulator, it is safe to
509 * emit f16to32 in align16 mode with UD as source data type.
510 */
511
512 dst_reg tmp_dst(this, glsl_type::uvec2_type);
513 src_reg tmp_src(tmp_dst);
514
515 tmp_dst.writemask = WRITEMASK_X;
516 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
517
518 tmp_dst.writemask = WRITEMASK_Y;
519 emit(SHR(tmp_dst, src0, src_reg(16u)));
520
521 dst.writemask = WRITEMASK_XY;
522 emit(F16TO32(dst, tmp_src));
523 }
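/* Worked example (illustrative): unpackHalf2x16(0x40003C00).
 * The AND leaves 0x3C00 in tmp.x, the SHR leaves 0x4000 in tmp.y, and
 * F16TO32 converts those halves to the floats (1.0, 2.0) in dst.xy.
 */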
524
525 void
526 vec4_visitor::visit_instructions(const exec_list *list)
527 {
528 foreach_list(node, list) {
529 ir_instruction *ir = (ir_instruction *)node;
530
531 base_ir = ir;
532 ir->accept(this);
533 }
534 }
535
536
537 static int
538 type_size(const struct glsl_type *type)
539 {
540 unsigned int i;
541 int size;
542
543 switch (type->base_type) {
544 case GLSL_TYPE_UINT:
545 case GLSL_TYPE_INT:
546 case GLSL_TYPE_FLOAT:
547 case GLSL_TYPE_BOOL:
548 if (type->is_matrix()) {
549 return type->matrix_columns;
550 } else {
551 /* Regardless of size of vector, it gets a vec4. This is bad
552 * packing for things like floats, but otherwise arrays become a
553 * mess. Hopefully a later pass over the code can pack scalars
554 * down if appropriate.
555 */
556 return 1;
557 }
558 case GLSL_TYPE_ARRAY:
559 assert(type->length > 0);
560 return type_size(type->fields.array) * type->length;
561 case GLSL_TYPE_STRUCT:
562 size = 0;
563 for (i = 0; i < type->length; i++) {
564 size += type_size(type->fields.structure[i].type);
565 }
566 return size;
567 case GLSL_TYPE_SAMPLER:
568 /* Samplers take up one slot in UNIFORMS[], but they're baked in
569 * at link time.
570 */
571 return 1;
572 case GLSL_TYPE_ATOMIC_UINT:
573 return 0;
574 case GLSL_TYPE_IMAGE:
575 case GLSL_TYPE_VOID:
576 case GLSL_TYPE_ERROR:
577 case GLSL_TYPE_INTERFACE:
578 assert(0);
579 break;
580 }
581
582 return 0;
583 }
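/* Worked examples of the slot counting above (illustrative):
 *
 *    float / vec3               -> 1 (every scalar or vector takes a full vec4)
 *    mat4                       -> 4 (one slot per column)
 *    float[3]                   -> 3 (element size times array length)
 *    struct { vec2 a; mat3 b; } -> 1 + 3 = 4 (sum over the fields)
 */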
584
585 int
586 vec4_visitor::virtual_grf_alloc(int size)
587 {
588 if (virtual_grf_array_size <= virtual_grf_count) {
589 if (virtual_grf_array_size == 0)
590 virtual_grf_array_size = 16;
591 else
592 virtual_grf_array_size *= 2;
593 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
594 virtual_grf_array_size);
595 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
596 virtual_grf_array_size);
597 }
598 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
599 virtual_grf_reg_count += size;
600 virtual_grf_sizes[virtual_grf_count] = size;
601 return virtual_grf_count++;
602 }
603
604 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
605 {
606 init();
607
608 this->file = GRF;
609 this->reg = v->virtual_grf_alloc(type_size(type));
610
611 if (type->is_array() || type->is_record()) {
612 this->swizzle = BRW_SWIZZLE_NOOP;
613 } else {
614 this->swizzle = swizzle_for_size(type->vector_elements);
615 }
616
617 this->type = brw_type_for_base_type(type);
618 }
619
620 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
621 {
622 init();
623
624 this->file = GRF;
625 this->reg = v->virtual_grf_alloc(type_size(type));
626
627 if (type->is_array() || type->is_record()) {
628 this->writemask = WRITEMASK_XYZW;
629 } else {
630 this->writemask = (1 << type->vector_elements) - 1;
631 }
632
633 this->type = brw_type_for_base_type(type);
634 }
635
636 /* Our support for uniforms is piggy-backed on the struct
637 * gl_fragment_program, because that's where the values actually
638 * get stored, rather than in some global gl_shader_program uniform
639 * store.
640 */
641 void
642 vec4_visitor::setup_uniform_values(ir_variable *ir)
643 {
644 int namelen = strlen(ir->name);
645
646 /* The data for our (non-builtin) uniforms is stored in a series of
647 * gl_uniform_driver_storage structs for each subcomponent that
648 * glGetUniformLocation() could name. We know it's been set up in the same
649 * order we'd walk the type, so walk the list of storage and find anything
650 * with our name, or the prefix of a component that starts with our name.
651 */
652 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
653 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
654
655 if (strncmp(ir->name, storage->name, namelen) != 0 ||
656 (storage->name[namelen] != 0 &&
657 storage->name[namelen] != '.' &&
658 storage->name[namelen] != '[')) {
659 continue;
660 }
661
662 gl_constant_value *components = storage->storage;
663 unsigned vector_count = (MAX2(storage->array_elements, 1) *
664 storage->type->matrix_columns);
665
666 for (unsigned s = 0; s < vector_count; s++) {
667 uniform_vector_size[uniforms] = storage->type->vector_elements;
668
669 int i;
670 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
671 stage_prog_data->param[uniforms * 4 + i] = &components->f;
672 components++;
673 }
674 for (; i < 4; i++) {
675 static float zero = 0;
676 stage_prog_data->param[uniforms * 4 + i] = &zero;
677 }
678
679 uniforms++;
680 }
681 }
682 }
683
684 void
685 vec4_visitor::setup_uniform_clipplane_values()
686 {
687 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
688
689 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
690 this->uniform_vector_size[this->uniforms] = 4;
691 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
692 this->userplane[i].type = BRW_REGISTER_TYPE_F;
693 for (int j = 0; j < 4; ++j) {
694 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
695 }
696 ++this->uniforms;
697 }
698 }
699
700 /* Our support for builtin uniforms is even scarier than non-builtin.
701 * It sits on top of the PROG_STATE_VAR parameters that are
702 * automatically updated from GL context state.
703 */
704 void
705 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
706 {
707 const ir_state_slot *const slots = ir->state_slots;
708 assert(ir->state_slots != NULL);
709
710 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
711 /* This state reference has already been set up by ir_to_mesa,
712 * but we'll get the same index back here. We can reference
713 * ParameterValues directly, since unlike brw_fs.cpp, we never
714 * add new state references during compile.
715 */
716 int index = _mesa_add_state_reference(this->prog->Parameters,
717 (gl_state_index *)slots[i].tokens);
718 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
719
720 this->uniform_vector_size[this->uniforms] = 0;
721 /* Add each of the unique swizzled channels of the element.
722 * This will end up matching the size of the glsl_type of this field.
723 */
724 int last_swiz = -1;
725 for (unsigned int j = 0; j < 4; j++) {
726 int swiz = GET_SWZ(slots[i].swizzle, j);
727 last_swiz = swiz;
728
729 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
730 if (swiz <= last_swiz)
731 this->uniform_vector_size[this->uniforms]++;
732 }
733 this->uniforms++;
734 }
735 }
736
737 dst_reg *
738 vec4_visitor::variable_storage(ir_variable *var)
739 {
740 return (dst_reg *)hash_table_find(this->variable_ht, var);
741 }
742
743 void
744 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
745 {
746 ir_expression *expr = ir->as_expression();
747
748 *predicate = BRW_PREDICATE_NORMAL;
749
750 if (expr) {
751 src_reg op[2];
752 vec4_instruction *inst;
753
754 assert(expr->get_num_operands() <= 2);
755 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
756 expr->operands[i]->accept(this);
757 op[i] = this->result;
758
759 resolve_ud_negate(&op[i]);
760 }
761
762 switch (expr->operation) {
763 case ir_unop_logic_not:
764 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
765 inst->conditional_mod = BRW_CONDITIONAL_Z;
766 break;
767
768 case ir_binop_logic_xor:
769 inst = emit(XOR(dst_null_d(), op[0], op[1]));
770 inst->conditional_mod = BRW_CONDITIONAL_NZ;
771 break;
772
773 case ir_binop_logic_or:
774 inst = emit(OR(dst_null_d(), op[0], op[1]));
775 inst->conditional_mod = BRW_CONDITIONAL_NZ;
776 break;
777
778 case ir_binop_logic_and:
779 inst = emit(AND(dst_null_d(), op[0], op[1]));
780 inst->conditional_mod = BRW_CONDITIONAL_NZ;
781 break;
782
783 case ir_unop_f2b:
784 if (brw->gen >= 6) {
785 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
786 } else {
787 inst = emit(MOV(dst_null_f(), op[0]));
788 inst->conditional_mod = BRW_CONDITIONAL_NZ;
789 }
790 break;
791
792 case ir_unop_i2b:
793 if (brw->gen >= 6) {
794 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
795 } else {
796 inst = emit(MOV(dst_null_d(), op[0]));
797 inst->conditional_mod = BRW_CONDITIONAL_NZ;
798 }
799 break;
800
801 case ir_binop_all_equal:
802 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
803 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
804 break;
805
806 case ir_binop_any_nequal:
807 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
808 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
809 break;
810
811 case ir_unop_any:
812 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
813 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
814 break;
815
816 case ir_binop_greater:
817 case ir_binop_gequal:
818 case ir_binop_less:
819 case ir_binop_lequal:
820 case ir_binop_equal:
821 case ir_binop_nequal:
822 emit(CMP(dst_null_d(), op[0], op[1],
823 brw_conditional_for_comparison(expr->operation)));
824 break;
825
826 default:
827 assert(!"not reached");
828 break;
829 }
830 return;
831 }
832
833 ir->accept(this);
834
835 resolve_ud_negate(&this->result);
836
837 if (brw->gen >= 6) {
838 vec4_instruction *inst = emit(AND(dst_null_d(),
839 this->result, src_reg(1)));
840 inst->conditional_mod = BRW_CONDITIONAL_NZ;
841 } else {
842 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
843 inst->conditional_mod = BRW_CONDITIONAL_NZ;
844 }
845 }
846
847 /**
848 * Emit a gen6 IF statement with the comparison folded into the IF
849 * instruction.
850 */
851 void
852 vec4_visitor::emit_if_gen6(ir_if *ir)
853 {
854 ir_expression *expr = ir->condition->as_expression();
855
856 if (expr) {
857 src_reg op[2];
858 dst_reg temp;
859
860 assert(expr->get_num_operands() <= 2);
861 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
862 expr->operands[i]->accept(this);
863 op[i] = this->result;
864 }
865
866 switch (expr->operation) {
867 case ir_unop_logic_not:
868 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
869 return;
870
871 case ir_binop_logic_xor:
872 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
873 return;
874
875 case ir_binop_logic_or:
876 temp = dst_reg(this, glsl_type::bool_type);
877 emit(OR(temp, op[0], op[1]));
878 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
879 return;
880
881 case ir_binop_logic_and:
882 temp = dst_reg(this, glsl_type::bool_type);
883 emit(AND(temp, op[0], op[1]));
884 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
885 return;
886
887 case ir_unop_f2b:
888 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
889 return;
890
891 case ir_unop_i2b:
892 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
893 return;
894
895 case ir_binop_greater:
896 case ir_binop_gequal:
897 case ir_binop_less:
898 case ir_binop_lequal:
899 case ir_binop_equal:
900 case ir_binop_nequal:
901 emit(IF(op[0], op[1],
902 brw_conditional_for_comparison(expr->operation)));
903 return;
904
905 case ir_binop_all_equal:
906 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
907 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
908 return;
909
910 case ir_binop_any_nequal:
911 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
912 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
913 return;
914
915 case ir_unop_any:
916 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
917 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
918 return;
919
920 default:
921 assert(!"not reached");
922 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
923 return;
924 }
925 return;
926 }
927
928 ir->condition->accept(this);
929
930 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
931 }
932
933 void
934 vec4_visitor::visit(ir_variable *ir)
935 {
936 dst_reg *reg = NULL;
937
938 if (variable_storage(ir))
939 return;
940
941 switch (ir->data.mode) {
942 case ir_var_shader_in:
943 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
944 break;
945
946 case ir_var_shader_out:
947 reg = new(mem_ctx) dst_reg(this, ir->type);
948
949 for (int i = 0; i < type_size(ir->type); i++) {
950 output_reg[ir->data.location + i] = *reg;
951 output_reg[ir->data.location + i].reg_offset = i;
952 output_reg[ir->data.location + i].type =
953 brw_type_for_base_type(ir->type->get_scalar_type());
954 output_reg_annotation[ir->data.location + i] = ir->name;
955 }
956 break;
957
958 case ir_var_auto:
959 case ir_var_temporary:
960 reg = new(mem_ctx) dst_reg(this, ir->type);
961 break;
962
963 case ir_var_uniform:
964 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
965
966 /* Thanks to the lower_ubo_reference pass, we will see only
967 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
968 * variables, so no need for them to be in variable_ht.
969 *
970 * Atomic counters take no uniform storage, no need to do
971 * anything here.
972 */
973 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
974 return;
975
976 /* Track how big the whole uniform variable is, in case we need to put a
977 * copy of its data into pull constants for array access.
978 */
979 this->uniform_size[this->uniforms] = type_size(ir->type);
980
981 if (!strncmp(ir->name, "gl_", 3)) {
982 setup_builtin_uniform_values(ir);
983 } else {
984 setup_uniform_values(ir);
985 }
986 break;
987
988 case ir_var_system_value:
989 reg = make_reg_for_system_value(ir);
990 break;
991
992 default:
993 assert(!"not reached");
994 }
995
996 reg->type = brw_type_for_base_type(ir->type);
997 hash_table_insert(this->variable_ht, reg, ir);
998 }
999
1000 void
1001 vec4_visitor::visit(ir_loop *ir)
1002 {
1003 /* We don't want debugging output to print the whole body of the
1004 * loop as the annotation.
1005 */
1006 this->base_ir = NULL;
1007
1008 emit(BRW_OPCODE_DO);
1009
1010 visit_instructions(&ir->body_instructions);
1011
1012 emit(BRW_OPCODE_WHILE);
1013 }
1014
1015 void
1016 vec4_visitor::visit(ir_loop_jump *ir)
1017 {
1018 switch (ir->mode) {
1019 case ir_loop_jump::jump_break:
1020 emit(BRW_OPCODE_BREAK);
1021 break;
1022 case ir_loop_jump::jump_continue:
1023 emit(BRW_OPCODE_CONTINUE);
1024 break;
1025 }
1026 }
1027
1028
1029 void
1030 vec4_visitor::visit(ir_function_signature *ir)
1031 {
1032 assert(0);
1033 (void)ir;
1034 }
1035
1036 void
1037 vec4_visitor::visit(ir_function *ir)
1038 {
1039 /* Ignore function bodies other than main() -- we shouldn't see calls to
1040 * them since they should all be inlined.
1041 */
1042 if (strcmp(ir->name, "main") == 0) {
1043 const ir_function_signature *sig;
1044 exec_list empty;
1045
1046 sig = ir->matching_signature(NULL, &empty);
1047
1048 assert(sig);
1049
1050 visit_instructions(&sig->body);
1051 }
1052 }
1053
1054 bool
1055 vec4_visitor::try_emit_sat(ir_expression *ir)
1056 {
1057 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1058 if (!sat_src)
1059 return false;
1060
1061 sat_src->accept(this);
1062 src_reg src = this->result;
1063
1064 this->result = src_reg(this, ir->type);
1065 vec4_instruction *inst;
1066 inst = emit(MOV(dst_reg(this->result), src));
1067 inst->saturate = true;
1068
1069 return true;
1070 }
1071
1072 bool
1073 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1074 {
1075 /* 3-src instructions were introduced in gen6. */
1076 if (brw->gen < 6)
1077 return false;
1078
1079 /* MAD can only handle floating-point data. */
1080 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1081 return false;
1082
1083 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1084 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1085
1086 if (!mul || mul->operation != ir_binop_mul)
1087 return false;
1088
1089 nonmul->accept(this);
1090 src_reg src0 = fix_3src_operand(this->result);
1091
1092 mul->operands[0]->accept(this);
1093 src_reg src1 = fix_3src_operand(this->result);
1094
1095 mul->operands[1]->accept(this);
1096 src_reg src2 = fix_3src_operand(this->result);
1097
1098 this->result = src_reg(this, ir->type);
1099 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1100
1101 return true;
1102 }
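/* Example (illustrative): for the IR expression add(mul(a, b), c) the code
 * above emits MAD(dst, c, a, b), i.e. dst = a * b + c -- the addend goes in
 * src0 while the two factors occupy src1 and src2.
 */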
1103
1104 void
1105 vec4_visitor::emit_bool_comparison(unsigned int op,
1106 dst_reg dst, src_reg src0, src_reg src1)
1107 {
1108 /* original gen4 does destination conversion before comparison. */
1109 if (brw->gen < 5)
1110 dst.type = src0.type;
1111
1112 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1113
1114 dst.type = BRW_REGISTER_TYPE_D;
1115 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1116 }
1117
1118 void
1119 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1120 src_reg src0, src_reg src1)
1121 {
1122 vec4_instruction *inst;
1123
1124 if (brw->gen >= 6) {
1125 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1126 inst->conditional_mod = conditionalmod;
1127 } else {
1128 emit(CMP(dst, src0, src1, conditionalmod));
1129
1130 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1131 inst->predicate = BRW_PREDICATE_NORMAL;
1132 }
1133 }
1134
1135 static bool
1136 is_16bit_constant(ir_rvalue *rvalue)
1137 {
1138 ir_constant *constant = rvalue->as_constant();
1139 if (!constant)
1140 return false;
1141
1142 if (constant->type != glsl_type::int_type &&
1143 constant->type != glsl_type::uint_type)
1144 return false;
1145
1146 return constant->value.u[0] < (1 << 16);
1147 }
1148
1149 void
1150 vec4_visitor::visit(ir_expression *ir)
1151 {
1152 unsigned int operand;
1153 src_reg op[Elements(ir->operands)];
1154 src_reg result_src;
1155 dst_reg result_dst;
1156 vec4_instruction *inst;
1157
1158 if (try_emit_sat(ir))
1159 return;
1160
1161 if (ir->operation == ir_binop_add) {
1162 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1163 return;
1164 }
1165
1166 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1167 this->result.file = BAD_FILE;
1168 ir->operands[operand]->accept(this);
1169 if (this->result.file == BAD_FILE) {
1170 fprintf(stderr, "Failed to get tree for expression operand:\n");
1171 ir->operands[operand]->fprint(stderr);
1172 exit(1);
1173 }
1174 op[operand] = this->result;
1175
1176 /* Matrix expression operands should have been broken down to vector
1177 * operations already.
1178 */
1179 assert(!ir->operands[operand]->type->is_matrix());
1180 }
1181
1182 int vector_elements = ir->operands[0]->type->vector_elements;
1183 if (ir->operands[1]) {
1184 vector_elements = MAX2(vector_elements,
1185 ir->operands[1]->type->vector_elements);
1186 }
1187
1188 this->result.file = BAD_FILE;
1189
1190 /* Storage for our result. Ideally for an assignment we'd be using
1191 * the actual storage for the result here, instead.
1192 */
1193 result_src = src_reg(this, ir->type);
1194 /* convenience for the emit functions below. */
1195 result_dst = dst_reg(result_src);
1196 /* If nothing special happens, this is the result. */
1197 this->result = result_src;
1198 /* Limit writes to the channels that will be used by result_src later.
1199 * This does limit this temp's use as a temporary for multi-instruction
1200 * sequences.
1201 */
1202 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1203
1204 switch (ir->operation) {
1205 case ir_unop_logic_not:
1206 /* Note that BRW_OPCODE_NOT is not appropriate here, since it computes
1207 * the one's complement of the whole register, not just bit 0.
1208 */
1209 emit(XOR(result_dst, op[0], src_reg(1)));
1210 break;
1211 case ir_unop_neg:
1212 op[0].negate = !op[0].negate;
1213 emit(MOV(result_dst, op[0]));
1214 break;
1215 case ir_unop_abs:
1216 op[0].abs = true;
1217 op[0].negate = false;
1218 emit(MOV(result_dst, op[0]));
1219 break;
1220
1221 case ir_unop_sign:
1222 if (ir->type->is_float()) {
1223 /* AND(val, 0x80000000) gives the sign bit.
1224 *
1225 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1226 * zero.
1227 */
1228 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1229
1230 op[0].type = BRW_REGISTER_TYPE_UD;
1231 result_dst.type = BRW_REGISTER_TYPE_UD;
1232 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1233
1234 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1235 inst->predicate = BRW_PREDICATE_NORMAL;
1236
1237 this->result.type = BRW_REGISTER_TYPE_F;
1238 } else {
1239 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1240 * -> non-negative val generates 0x00000000.
1241 * Predicated OR sets 1 if val is positive.
1242 */
1243 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1244
1245 emit(ASR(result_dst, op[0], src_reg(31)));
1246
1247 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1248 inst->predicate = BRW_PREDICATE_NORMAL;
1249 }
1250 break;
1251
1252 case ir_unop_rcp:
1253 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1254 break;
1255
1256 case ir_unop_exp2:
1257 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1258 break;
1259 case ir_unop_log2:
1260 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1261 break;
1262 case ir_unop_exp:
1263 case ir_unop_log:
1264 assert(!"not reached: should be handled by ir_explog_to_explog2");
1265 break;
1266 case ir_unop_sin:
1267 case ir_unop_sin_reduced:
1268 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1269 break;
1270 case ir_unop_cos:
1271 case ir_unop_cos_reduced:
1272 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1273 break;
1274
1275 case ir_unop_dFdx:
1276 case ir_unop_dFdy:
1277 assert(!"derivatives not valid in vertex shader");
1278 break;
1279
1280 case ir_unop_bitfield_reverse:
1281 emit(BFREV(result_dst, op[0]));
1282 break;
1283 case ir_unop_bit_count:
1284 emit(CBIT(result_dst, op[0]));
1285 break;
1286 case ir_unop_find_msb: {
1287 src_reg temp = src_reg(this, glsl_type::uint_type);
1288
1289 inst = emit(FBH(dst_reg(temp), op[0]));
1290 inst->dst.writemask = WRITEMASK_XYZW;
1291
1292 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1293 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1294 * subtract the result from 31 to convert the MSB count into an LSB count.
1295 */
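/* Worked example (illustrative): findMSB(0x00000010) should return 4.
 * FBH, counting from the MSB, yields 27, and 31 - 27 = 4. For an input of 0,
 * FBH returns 0xFFFFFFFF, the CMP below leaves the predicate unset, and the
 * result stays at -1 as GLSL requires.
 */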
1296
1297 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1298 temp.swizzle = BRW_SWIZZLE_NOOP;
1299 emit(MOV(result_dst, temp));
1300
1301 src_reg src_tmp = src_reg(result_dst);
1302 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1303
1304 src_tmp.negate = true;
1305 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1306 inst->predicate = BRW_PREDICATE_NORMAL;
1307 break;
1308 }
1309 case ir_unop_find_lsb:
1310 emit(FBL(result_dst, op[0]));
1311 break;
1312
1313 case ir_unop_noise:
1314 assert(!"not reached: should be handled by lower_noise");
1315 break;
1316
1317 case ir_binop_add:
1318 emit(ADD(result_dst, op[0], op[1]));
1319 break;
1320 case ir_binop_sub:
1321 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1322 break;
1323
1324 case ir_binop_mul:
1325 if (brw->gen < 8 && ir->type->is_integer()) {
1326 /* For integer multiplication, the MUL uses the low 16 bits of one of
1327 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1328 * accumulates the contribution of the upper 16 bits of that
1329 * operand. If we can determine that one of the args is in the low
1330 * 16 bits, though, we can just emit a single MUL.
1331 */
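/* Example of the shortcut (illustrative): for "x * 3" the constant fits in
 * 16 bits, so a single MUL is enough; the only subtlety is which source must
 * carry the constant -- src0 on gen < 7, src1 on gen >= 7 -- which is why
 * the operand order differs between the two emits below.
 */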
1332 if (is_16bit_constant(ir->operands[0])) {
1333 if (brw->gen < 7)
1334 emit(MUL(result_dst, op[0], op[1]));
1335 else
1336 emit(MUL(result_dst, op[1], op[0]));
1337 } else if (is_16bit_constant(ir->operands[1])) {
1338 if (brw->gen < 7)
1339 emit(MUL(result_dst, op[1], op[0]));
1340 else
1341 emit(MUL(result_dst, op[0], op[1]));
1342 } else {
1343 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1344
1345 emit(MUL(acc, op[0], op[1]));
1346 emit(MACH(dst_null_d(), op[0], op[1]));
1347 emit(MOV(result_dst, src_reg(acc)));
1348 }
1349 } else {
1350 emit(MUL(result_dst, op[0], op[1]));
1351 }
1352 break;
1353 case ir_binop_imul_high: {
1354 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1355
1356 emit(MUL(acc, op[0], op[1]));
1357 emit(MACH(result_dst, op[0], op[1]));
1358 break;
1359 }
1360 case ir_binop_div:
1361 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1362 assert(ir->type->is_integer());
1363 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1364 break;
1365 case ir_binop_carry: {
1366 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1367
1368 emit(ADDC(dst_null_ud(), op[0], op[1]));
1369 emit(MOV(result_dst, src_reg(acc)));
1370 break;
1371 }
1372 case ir_binop_borrow: {
1373 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1374
1375 emit(SUBB(dst_null_ud(), op[0], op[1]));
1376 emit(MOV(result_dst, src_reg(acc)));
1377 break;
1378 }
1379 case ir_binop_mod:
1380 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1381 assert(ir->type->is_integer());
1382 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1383 break;
1384
1385 case ir_binop_less:
1386 case ir_binop_greater:
1387 case ir_binop_lequal:
1388 case ir_binop_gequal:
1389 case ir_binop_equal:
1390 case ir_binop_nequal: {
1391 emit(CMP(result_dst, op[0], op[1],
1392 brw_conditional_for_comparison(ir->operation)));
1393 emit(AND(result_dst, result_src, src_reg(0x1)));
1394 break;
1395 }
1396
1397 case ir_binop_all_equal:
1398 /* "==" operator producing a scalar boolean. */
1399 if (ir->operands[0]->type->is_vector() ||
1400 ir->operands[1]->type->is_vector()) {
1401 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1402 emit(MOV(result_dst, src_reg(0)));
1403 inst = emit(MOV(result_dst, src_reg(1)));
1404 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1405 } else {
1406 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1407 emit(AND(result_dst, result_src, src_reg(0x1)));
1408 }
1409 break;
1410 case ir_binop_any_nequal:
1411 /* "!=" operator producing a scalar boolean. */
1412 if (ir->operands[0]->type->is_vector() ||
1413 ir->operands[1]->type->is_vector()) {
1414 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1415
1416 emit(MOV(result_dst, src_reg(0)));
1417 inst = emit(MOV(result_dst, src_reg(1)));
1418 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1419 } else {
1420 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1421 emit(AND(result_dst, result_src, src_reg(0x1)));
1422 }
1423 break;
1424
1425 case ir_unop_any:
1426 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1427 emit(MOV(result_dst, src_reg(0)));
1428
1429 inst = emit(MOV(result_dst, src_reg(1)));
1430 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1431 break;
1432
1433 case ir_binop_logic_xor:
1434 emit(XOR(result_dst, op[0], op[1]));
1435 break;
1436
1437 case ir_binop_logic_or:
1438 emit(OR(result_dst, op[0], op[1]));
1439 break;
1440
1441 case ir_binop_logic_and:
1442 emit(AND(result_dst, op[0], op[1]));
1443 break;
1444
1445 case ir_binop_dot:
1446 assert(ir->operands[0]->type->is_vector());
1447 assert(ir->operands[0]->type == ir->operands[1]->type);
1448 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1449 break;
1450
1451 case ir_unop_sqrt:
1452 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1453 break;
1454 case ir_unop_rsq:
1455 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1456 break;
1457
1458 case ir_unop_bitcast_i2f:
1459 case ir_unop_bitcast_u2f:
1460 this->result = op[0];
1461 this->result.type = BRW_REGISTER_TYPE_F;
1462 break;
1463
1464 case ir_unop_bitcast_f2i:
1465 this->result = op[0];
1466 this->result.type = BRW_REGISTER_TYPE_D;
1467 break;
1468
1469 case ir_unop_bitcast_f2u:
1470 this->result = op[0];
1471 this->result.type = BRW_REGISTER_TYPE_UD;
1472 break;
1473
1474 case ir_unop_i2f:
1475 case ir_unop_i2u:
1476 case ir_unop_u2i:
1477 case ir_unop_u2f:
1478 case ir_unop_b2f:
1479 case ir_unop_b2i:
1480 case ir_unop_f2i:
1481 case ir_unop_f2u:
1482 emit(MOV(result_dst, op[0]));
1483 break;
1484 case ir_unop_f2b:
1485 case ir_unop_i2b: {
1486 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1487 emit(AND(result_dst, result_src, src_reg(1)));
1488 break;
1489 }
1490
1491 case ir_unop_trunc:
1492 emit(RNDZ(result_dst, op[0]));
1493 break;
1494 case ir_unop_ceil:
1495 op[0].negate = !op[0].negate;
1496 inst = emit(RNDD(result_dst, op[0]));
1497 this->result.negate = true;
1498 break;
1499 case ir_unop_floor:
1500 inst = emit(RNDD(result_dst, op[0]));
1501 break;
1502 case ir_unop_fract:
1503 inst = emit(FRC(result_dst, op[0]));
1504 break;
1505 case ir_unop_round_even:
1506 emit(RNDE(result_dst, op[0]));
1507 break;
1508
1509 case ir_binop_min:
1510 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1511 break;
1512 case ir_binop_max:
1513 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1514 break;
1515
1516 case ir_binop_pow:
1517 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1518 break;
1519
1520 case ir_unop_bit_not:
1521 inst = emit(NOT(result_dst, op[0]));
1522 break;
1523 case ir_binop_bit_and:
1524 inst = emit(AND(result_dst, op[0], op[1]));
1525 break;
1526 case ir_binop_bit_xor:
1527 inst = emit(XOR(result_dst, op[0], op[1]));
1528 break;
1529 case ir_binop_bit_or:
1530 inst = emit(OR(result_dst, op[0], op[1]));
1531 break;
1532
1533 case ir_binop_lshift:
1534 inst = emit(SHL(result_dst, op[0], op[1]));
1535 break;
1536
1537 case ir_binop_rshift:
1538 if (ir->type->base_type == GLSL_TYPE_INT)
1539 inst = emit(ASR(result_dst, op[0], op[1]));
1540 else
1541 inst = emit(SHR(result_dst, op[0], op[1]));
1542 break;
1543
1544 case ir_binop_bfm:
1545 emit(BFI1(result_dst, op[0], op[1]));
1546 break;
1547
1548 case ir_binop_ubo_load: {
1549 ir_constant *uniform_block = ir->operands[0]->as_constant();
1550 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1551 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1552 src_reg offset;
1553
1554 /* Now, load the vector from that offset. */
1555 assert(ir->type->is_vector() || ir->type->is_scalar());
1556
1557 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1558 packed_consts.type = result.type;
1559 src_reg surf_index =
1560 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1561 if (const_offset_ir) {
1562 if (brw->gen >= 8) {
1563 /* Store the offset in a GRF so we can send-from-GRF. */
1564 offset = src_reg(this, glsl_type::int_type);
1565 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1566 } else {
1567 /* Immediates are fine on older generations since they'll be moved
1568 * to a (potentially fake) MRF at the generator level.
1569 */
1570 offset = src_reg(const_offset / 16);
1571 }
1572 } else {
1573 offset = src_reg(this, glsl_type::uint_type);
1574 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1575 }
1576
1577 if (brw->gen >= 7) {
1578 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1579 grf_offset.type = offset.type;
1580
1581 emit(MOV(grf_offset, offset));
1582
1583 emit(new(mem_ctx) vec4_instruction(this,
1584 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1585 dst_reg(packed_consts),
1586 surf_index,
1587 src_reg(grf_offset)));
1588 } else {
1589 vec4_instruction *pull =
1590 emit(new(mem_ctx) vec4_instruction(this,
1591 VS_OPCODE_PULL_CONSTANT_LOAD,
1592 dst_reg(packed_consts),
1593 surf_index,
1594 offset));
1595 pull->base_mrf = 14;
1596 pull->mlen = 1;
1597 }
1598
1599 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1600 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1601 const_offset % 16 / 4,
1602 const_offset % 16 / 4,
1603 const_offset % 16 / 4);
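/* Example (illustrative): a float at UBO byte offset 24 is fetched as the
 * vec4 at offset 16 (const_offset / 16 above), and 24 % 16 / 4 == 2 selects
 * its z component for every channel of the result.
 */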
1604
1605 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1606 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1607 emit(CMP(result_dst, packed_consts, src_reg(0u),
1608 BRW_CONDITIONAL_NZ));
1609 emit(AND(result_dst, result, src_reg(0x1)));
1610 } else {
1611 emit(MOV(result_dst, packed_consts));
1612 }
1613 break;
1614 }
1615
1616 case ir_binop_vector_extract:
1617 assert(!"should have been lowered by vec_index_to_cond_assign");
1618 break;
1619
1620 case ir_triop_fma:
1621 op[0] = fix_3src_operand(op[0]);
1622 op[1] = fix_3src_operand(op[1]);
1623 op[2] = fix_3src_operand(op[2]);
1624 /* Note that the instruction's argument order is reversed from GLSL
1625 * and the IR.
1626 */
1627 emit(MAD(result_dst, op[2], op[1], op[0]));
1628 break;
1629
1630 case ir_triop_lrp:
1631 op[0] = fix_3src_operand(op[0]);
1632 op[1] = fix_3src_operand(op[1]);
1633 op[2] = fix_3src_operand(op[2]);
1634 /* Note that the instruction's argument order is reversed from GLSL
1635 * and the IR.
1636 */
1637 emit(LRP(result_dst, op[2], op[1], op[0]));
1638 break;
1639
1640 case ir_triop_csel:
1641 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1642 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1643 inst->predicate = BRW_PREDICATE_NORMAL;
1644 break;
1645
1646 case ir_triop_bfi:
1647 op[0] = fix_3src_operand(op[0]);
1648 op[1] = fix_3src_operand(op[1]);
1649 op[2] = fix_3src_operand(op[2]);
1650 emit(BFI2(result_dst, op[0], op[1], op[2]));
1651 break;
1652
1653 case ir_triop_bitfield_extract:
1654 op[0] = fix_3src_operand(op[0]);
1655 op[1] = fix_3src_operand(op[1]);
1656 op[2] = fix_3src_operand(op[2]);
1657 /* Note that the instruction's argument order is reversed from GLSL
1658 * and the IR.
1659 */
1660 emit(BFE(result_dst, op[2], op[1], op[0]));
1661 break;
1662
1663 case ir_triop_vector_insert:
1664 assert(!"should have been lowered by lower_vector_insert");
1665 break;
1666
1667 case ir_quadop_bitfield_insert:
1668 assert(!"not reached: should be handled by "
1669 "bitfield_insert_to_bfm_bfi\n");
1670 break;
1671
1672 case ir_quadop_vector:
1673 assert(!"not reached: should be handled by lower_quadop_vector");
1674 break;
1675
1676 case ir_unop_pack_half_2x16:
1677 emit_pack_half_2x16(result_dst, op[0]);
1678 break;
1679 case ir_unop_unpack_half_2x16:
1680 emit_unpack_half_2x16(result_dst, op[0]);
1681 break;
1682 case ir_unop_pack_snorm_2x16:
1683 case ir_unop_pack_snorm_4x8:
1684 case ir_unop_pack_unorm_2x16:
1685 case ir_unop_pack_unorm_4x8:
1686 case ir_unop_unpack_snorm_2x16:
1687 case ir_unop_unpack_snorm_4x8:
1688 case ir_unop_unpack_unorm_2x16:
1689 case ir_unop_unpack_unorm_4x8:
1690 assert(!"not reached: should be handled by lower_packing_builtins");
1691 break;
1692 case ir_unop_unpack_half_2x16_split_x:
1693 case ir_unop_unpack_half_2x16_split_y:
1694 case ir_binop_pack_half_2x16_split:
1695 assert(!"not reached: should not occur in vertex shader");
1696 break;
1697 case ir_binop_ldexp:
1698 assert(!"not reached: should be handled by ldexp_to_arith()");
1699 break;
1700 }
1701 }
1702
1703
1704 void
1705 vec4_visitor::visit(ir_swizzle *ir)
1706 {
1707 src_reg src;
1708 int i = 0;
1709 int swizzle[4];
1710
1711 /* Note that this is only swizzles in expressions, not those on the left
1712 * hand side of an assignment, which do write masking. See ir_assignment
1713 * for that.
1714 */
1715
1716 ir->val->accept(this);
1717 src = this->result;
1718 assert(src.file != BAD_FILE);
1719
1720 for (i = 0; i < ir->type->vector_elements; i++) {
1721 switch (i) {
1722 case 0:
1723 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1724 break;
1725 case 1:
1726 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1727 break;
1728 case 2:
1729 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1730 break;
1731 case 3:
1732 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1733 break;
1734 }
1735 }
1736 for (; i < 4; i++) {
1737 /* Replicate the last channel out. */
1738 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1739 }
1740
1741 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1742
1743 this->result = src;
1744 }
1745
1746 void
1747 vec4_visitor::visit(ir_dereference_variable *ir)
1748 {
1749 const struct glsl_type *type = ir->type;
1750 dst_reg *reg = variable_storage(ir->var);
1751
1752 if (!reg) {
1753 fail("Failed to find variable storage for %s\n", ir->var->name);
1754 this->result = src_reg(brw_null_reg());
1755 return;
1756 }
1757
1758 this->result = src_reg(*reg);
1759
1760 /* System values get their swizzle from the dst_reg writemask */
1761 if (ir->var->data.mode == ir_var_system_value)
1762 return;
1763
1764 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1765 this->result.swizzle = swizzle_for_size(type->vector_elements);
1766 }
1767
1768
1769 int
1770 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1771 {
1772 /* Under normal circumstances array elements are stored consecutively, so
1773 * the stride is equal to the size of the array element.
1774 */
1775 return type_size(ir->type);
1776 }
1777
1778
1779 void
1780 vec4_visitor::visit(ir_dereference_array *ir)
1781 {
1782 ir_constant *constant_index;
1783 src_reg src;
1784 int array_stride = compute_array_stride(ir);
1785
1786 constant_index = ir->array_index->constant_expression_value();
1787
1788 ir->array->accept(this);
1789 src = this->result;
1790
1791 if (constant_index) {
1792 src.reg_offset += constant_index->value.i[0] * array_stride;
1793 } else {
1794 /* Variable index array dereference. It eats the "vec4" of the
1795 * base of the array and an index that offsets the Mesa register
1796 * index.
1797 */
1798 ir->array_index->accept(this);
1799
1800 src_reg index_reg;
1801
1802 if (array_stride == 1) {
1803 index_reg = this->result;
1804 } else {
1805 index_reg = src_reg(this, glsl_type::int_type);
1806
1807 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1808 }
1809
1810 if (src.reladdr) {
1811 src_reg temp = src_reg(this, glsl_type::int_type);
1812
1813 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1814
1815 index_reg = temp;
1816 }
1817
1818 src.reladdr = ralloc(mem_ctx, src_reg);
1819 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1820 }
1821
1822 /* If the type is smaller than a vec4, replicate the last channel out. */
1823 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1824 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1825 else
1826 src.swizzle = BRW_SWIZZLE_NOOP;
1827 src.type = brw_type_for_base_type(ir->type);
1828
1829 this->result = src;
1830 }
1831
1832 void
1833 vec4_visitor::visit(ir_dereference_record *ir)
1834 {
1835 unsigned int i;
1836 const glsl_type *struct_type = ir->record->type;
1837 int offset = 0;
1838
1839 ir->record->accept(this);
1840
1841 for (i = 0; i < struct_type->length; i++) {
1842 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1843 break;
1844 offset += type_size(struct_type->fields.structure[i].type);
1845 }
1846
1847 /* If the type is smaller than a vec4, replicate the last channel out. */
1848 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1849 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1850 else
1851 this->result.swizzle = BRW_SWIZZLE_NOOP;
1852 this->result.type = brw_type_for_base_type(ir->type);
1853
1854 this->result.reg_offset += offset;
1855 }
1856
1857 /**
1858 * We want to be careful in assignment setup to hit the actual storage
1859 * instead of potentially using a temporary like we might with the
1860 * ir_dereference handler.
1861 */
1862 static dst_reg
1863 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1864 {
1865 /* The LHS must be a dereference. If the LHS is a variable indexed array
1866 * access of a vector, it must be separated into a series conditional moves
1867 * before reaching this point (see ir_vec_index_to_cond_assign).
1868 */
1869 assert(ir->as_dereference());
1870 ir_dereference_array *deref_array = ir->as_dereference_array();
1871 if (deref_array) {
1872 assert(!deref_array->array->type->is_vector());
1873 }
1874
1875 /* Use the rvalue deref handler for the most part. We'll ignore
1876 * swizzles in it and write swizzles using writemask, though.
1877 */
1878 ir->accept(v);
1879 return dst_reg(v->result);
1880 }
1881
1882 void
1883 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1884 const struct glsl_type *type, uint32_t predicate)
1885 {
1886 if (type->base_type == GLSL_TYPE_STRUCT) {
1887 for (unsigned int i = 0; i < type->length; i++) {
1888 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1889 }
1890 return;
1891 }
1892
1893 if (type->is_array()) {
1894 for (unsigned int i = 0; i < type->length; i++) {
1895 emit_block_move(dst, src, type->fields.array, predicate);
1896 }
1897 return;
1898 }
1899
1900 if (type->is_matrix()) {
1901 const struct glsl_type *vec_type;
1902
1903 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1904 type->vector_elements, 1);
1905
1906 for (int i = 0; i < type->matrix_columns; i++) {
1907 emit_block_move(dst, src, vec_type, predicate);
1908 }
1909 return;
1910 }
1911
1912 assert(type->is_scalar() || type->is_vector());
1913
1914 dst->type = brw_type_for_base_type(type);
1915 src->type = dst->type;
1916
1917 dst->writemask = (1 << type->vector_elements) - 1;
1918
1919 src->swizzle = swizzle_for_size(type->vector_elements);
1920
1921 vec4_instruction *inst = emit(MOV(*dst, *src));
1922 inst->predicate = predicate;
1923
1924 dst->reg_offset++;
1925 src->reg_offset++;
1926 }
1927
1928
1929 /* If the RHS processing resulted in an instruction generating a
1930 * temporary value, and it would be easy to rewrite the instruction to
1931 * generate its result right into the LHS instead, do so. This ends
1932 * up reliably removing instructions where it can be tricky to do so
1933 * later without real UD chain information.
1934 */
1935 bool
1936 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1937 dst_reg dst,
1938 src_reg src,
1939 vec4_instruction *pre_rhs_inst,
1940 vec4_instruction *last_rhs_inst)
1941 {
1942 /* This could be supported, but it would take more smarts. */
1943 if (ir->condition)
1944 return false;
1945
1946 if (pre_rhs_inst == last_rhs_inst)
1947 return false; /* No instructions generated to work with. */
1948
1949 /* Make sure the last instruction generated our source reg. */
1950 if (src.file != GRF ||
1951 src.file != last_rhs_inst->dst.file ||
1952 src.reg != last_rhs_inst->dst.reg ||
1953 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1954 src.reladdr ||
1955 src.abs ||
1956 src.negate ||
1957 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1958 return false;
1959
1960    /* Check that the last instruction fully initialized the channels
1961    * we want to use, in the order we want to use them.  We could
1962    * potentially reswizzle the operands of many instructions so that
1963    * we could handle out-of-order channels, but we don't yet.
1964    */
1965
1966 for (unsigned i = 0; i < 4; i++) {
1967 if (dst.writemask & (1 << i)) {
1968 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1969 return false;
1970
1971 if (BRW_GET_SWZ(src.swizzle, i) != i)
1972 return false;
1973 }
1974 }
1975
1976 /* Success! Rewrite the instruction. */
1977 last_rhs_inst->dst.file = dst.file;
1978 last_rhs_inst->dst.reg = dst.reg;
1979 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1980 last_rhs_inst->dst.reladdr = dst.reladdr;
1981 last_rhs_inst->dst.writemask &= dst.writemask;
1982
1983 return true;
1984 }
1985
1986 void
1987 vec4_visitor::visit(ir_assignment *ir)
1988 {
1989 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1990 uint32_t predicate = BRW_PREDICATE_NONE;
1991
1992 if (!ir->lhs->type->is_scalar() &&
1993 !ir->lhs->type->is_vector()) {
1994 ir->rhs->accept(this);
1995 src_reg src = this->result;
1996
1997 if (ir->condition) {
1998 emit_bool_to_cond_code(ir->condition, &predicate);
1999 }
2000
2001 /* emit_block_move doesn't account for swizzles in the source register.
2002 * This should be ok, since the source register is a structure or an
2003 * array, and those can't be swizzled. But double-check to be sure.
2004 */
2005 assert(src.swizzle ==
2006 (ir->rhs->type->is_matrix()
2007 ? swizzle_for_size(ir->rhs->type->vector_elements)
2008 : BRW_SWIZZLE_NOOP));
2009
2010 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2011 return;
2012 }
2013
2014 /* Now we're down to just a scalar/vector with writemasks. */
2015 int i;
2016
2017 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2018 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2019
2020 ir->rhs->accept(this);
2021
2022 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2023
2024 src_reg src = this->result;
2025
2026 int swizzles[4];
2027 int first_enabled_chan = 0;
2028 int src_chan = 0;
2029
2030 assert(ir->lhs->type->is_vector() ||
2031 ir->lhs->type->is_scalar());
2032 dst.writemask = ir->write_mask;
2033
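   /* Find the swizzle component feeding the first written channel; channels
    * that aren't written will replicate it so every swizzle slot stays valid.
    */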
2034 for (int i = 0; i < 4; i++) {
2035 if (dst.writemask & (1 << i)) {
2036 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2037 break;
2038 }
2039 }
2040
2041    /* Swizzle a small RHS vector into the channels being written.
2042    *
2043    * GLSL IR treats write_mask as dictating how many channels are
2044    * present on the RHS, while in our instructions we need those
2045    * channels to appear in the slots of the vec4 they're written to.
2046    */
2047 for (int i = 0; i < 4; i++) {
2048 if (dst.writemask & (1 << i))
2049 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2050 else
2051 swizzles[i] = first_enabled_chan;
2052 }
2053 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2054 swizzles[2], swizzles[3]);
2055
2056 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2057 return;
2058 }
2059
2060 if (ir->condition) {
2061 emit_bool_to_cond_code(ir->condition, &predicate);
2062 }
2063
2064 for (i = 0; i < type_size(ir->lhs->type); i++) {
2065 vec4_instruction *inst = emit(MOV(dst, src));
2066 inst->predicate = predicate;
2067
2068 dst.reg_offset++;
2069 src.reg_offset++;
2070 }
2071 }
2072
2073 void
2074 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2075 {
2076 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2077 foreach_list(node, &ir->components) {
2078 ir_constant *field_value = (ir_constant *)node;
2079
2080 emit_constant_values(dst, field_value);
2081 }
2082 return;
2083 }
2084
2085 if (ir->type->is_array()) {
2086 for (unsigned int i = 0; i < ir->type->length; i++) {
2087 emit_constant_values(dst, ir->array_elements[i]);
2088 }
2089 return;
2090 }
2091
2092 if (ir->type->is_matrix()) {
2093 for (int i = 0; i < ir->type->matrix_columns; i++) {
2094 float *vec = &ir->value.f[i * ir->type->vector_elements];
2095
2096 for (int j = 0; j < ir->type->vector_elements; j++) {
2097 dst->writemask = 1 << j;
2098 dst->type = BRW_REGISTER_TYPE_F;
2099
2100 emit(MOV(*dst, src_reg(vec[j])));
2101 }
2102 dst->reg_offset++;
2103 }
2104 return;
2105 }
2106
2107 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2108
2109 for (int i = 0; i < ir->type->vector_elements; i++) {
2110 if (!(remaining_writemask & (1 << i)))
2111 continue;
2112
2113 dst->writemask = 1 << i;
2114 dst->type = brw_type_for_base_type(ir->type);
2115
2116 /* Find other components that match the one we're about to
2117 * write. Emits fewer instructions for things like vec4(0.5,
2118 * 1.5, 1.5, 1.5).
2119 */
2120 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2121 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2122 if (ir->value.b[i] == ir->value.b[j])
2123 dst->writemask |= (1 << j);
2124 } else {
2125 /* u, i, and f storage all line up, so no need for a
2126 * switch case for comparing each type.
2127 */
2128 if (ir->value.u[i] == ir->value.u[j])
2129 dst->writemask |= (1 << j);
2130 }
2131 }
2132
2133 switch (ir->type->base_type) {
2134 case GLSL_TYPE_FLOAT:
2135 emit(MOV(*dst, src_reg(ir->value.f[i])));
2136 break;
2137 case GLSL_TYPE_INT:
2138 emit(MOV(*dst, src_reg(ir->value.i[i])));
2139 break;
2140 case GLSL_TYPE_UINT:
2141 emit(MOV(*dst, src_reg(ir->value.u[i])));
2142 break;
2143 case GLSL_TYPE_BOOL:
2144 emit(MOV(*dst, src_reg(ir->value.b[i])));
2145 break;
2146 default:
2147 assert(!"Non-float/uint/int/bool constant");
2148 break;
2149 }
2150
2151 remaining_writemask &= ~dst->writemask;
2152 }
2153 dst->reg_offset++;
2154 }
2155
2156 void
2157 vec4_visitor::visit(ir_constant *ir)
2158 {
2159 dst_reg dst = dst_reg(this, ir->type);
2160 this->result = src_reg(dst);
2161
2162 emit_constant_values(&dst, ir);
2163 }
2164
2165 void
2166 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2167 {
2168 ir_dereference *deref = static_cast<ir_dereference *>(
2169 ir->actual_parameters.get_head());
2170 ir_variable *location = deref->variable_referenced();
2171 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2172 location->data.atomic.buffer_index);
2173
2174 /* Calculate the surface offset */
2175 src_reg offset(this, glsl_type::uint_type);
2176 ir_dereference_array *deref_array = deref->as_dereference_array();
2177 if (deref_array) {
2178 deref_array->array_index->accept(this);
2179
2180 src_reg tmp(this, glsl_type::uint_type);
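      /* offset = array_index * ATOMIC_COUNTER_SIZE + the counter's own
       * offset within the buffer.
       */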
2181 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2182 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2183 } else {
2184 offset = location->data.atomic.offset;
2185 }
2186
2187 /* Emit the appropriate machine instruction */
2188 const char *callee = ir->callee->function_name();
2189 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2190
2191 if (!strcmp("__intrinsic_atomic_read", callee)) {
2192 emit_untyped_surface_read(surf_index, dst, offset);
2193
2194 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2195 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2196 src_reg(), src_reg());
2197
2198 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2199 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2200 src_reg(), src_reg());
2201 }
2202 }
2203
2204 void
2205 vec4_visitor::visit(ir_call *ir)
2206 {
2207 const char *callee = ir->callee->function_name();
2208
2209 if (!strcmp("__intrinsic_atomic_read", callee) ||
2210 !strcmp("__intrinsic_atomic_increment", callee) ||
2211 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2212 visit_atomic_counter_intrinsic(ir);
2213 } else {
2214 assert(!"Unsupported intrinsic.");
2215 }
2216 }
2217
2218 src_reg
2219 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2220 {
2221 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2222 inst->base_mrf = 2;
2223 inst->mlen = 1;
2224 inst->sampler = sampler;
2225 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2226 inst->dst.writemask = WRITEMASK_XYZW;
2227
2228    /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2229 int param_base = inst->base_mrf;
2230 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2231 int zero_mask = 0xf & ~coord_mask;
2232
2233 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2234 coordinate));
2235
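   /* Zero out any coordinate channels the coordinate type doesn't cover so
    * the payload register is fully defined.
    */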
2236 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2237 src_reg(0)));
2238
2239 emit(inst);
2240 return src_reg(inst->dst);
2241 }
2242
2243 void
2244 vec4_visitor::visit(ir_texture *ir)
2245 {
2246 int sampler =
2247 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2248
2249 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2250 * emitting anything other than setting up the constant result.
2251 */
2252 if (ir->op == ir_tg4) {
2253 ir_constant *chan = ir->lod_info.component->as_constant();
2254 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2255 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2256 dst_reg result(this, ir->type);
2257 this->result = src_reg(result);
2258 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2259 return;
2260 }
2261 }
2262
2263 /* Should be lowered by do_lower_texture_projection */
2264 assert(!ir->projector);
2265
2266    /* Array offsets should already be lowered. */
2267 assert(!ir->offset || !ir->offset->type->is_array());
2268
2269 /* Generate code to compute all the subexpression trees. This has to be
2270 * done before loading any values into MRFs for the sampler message since
2271 * generating these values may involve SEND messages that need the MRFs.
2272 */
2273 src_reg coordinate;
2274 if (ir->coordinate) {
2275 ir->coordinate->accept(this);
2276 coordinate = this->result;
2277 }
2278
2279 src_reg shadow_comparitor;
2280 if (ir->shadow_comparitor) {
2281 ir->shadow_comparitor->accept(this);
2282 shadow_comparitor = this->result;
2283 }
2284
2285 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2286 src_reg offset_value;
2287 if (has_nonconstant_offset) {
2288 ir->offset->accept(this);
2289 offset_value = src_reg(this->result);
2290 }
2291
2292 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2293 src_reg lod, dPdx, dPdy, sample_index, mcs;
2294 switch (ir->op) {
2295 case ir_tex:
2296 lod = src_reg(0.0f);
2297 lod_type = glsl_type::float_type;
2298 break;
2299 case ir_txf:
2300 case ir_txl:
2301 case ir_txs:
2302 ir->lod_info.lod->accept(this);
2303 lod = this->result;
2304 lod_type = ir->lod_info.lod->type;
2305 break;
2306 case ir_query_levels:
2307 lod = src_reg(0);
2308 lod_type = glsl_type::int_type;
2309 break;
2310 case ir_txf_ms:
2311 ir->lod_info.sample_index->accept(this);
2312 sample_index = this->result;
2313 sample_index_type = ir->lod_info.sample_index->type;
2314
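      /* On Gen7+ with a compressed multisample surface, fetch the MCS value
       * for this texel up front; otherwise the MCS argument is just zero.
       */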
2315 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2316 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2317 else
2318 mcs = src_reg(0u);
2319 break;
2320 case ir_txd:
2321 ir->lod_info.grad.dPdx->accept(this);
2322 dPdx = this->result;
2323
2324 ir->lod_info.grad.dPdy->accept(this);
2325 dPdy = this->result;
2326
2327 lod_type = ir->lod_info.grad.dPdx->type;
2328 break;
2329 case ir_txb:
2330 case ir_lod:
2331 case ir_tg4:
2332 break;
2333 }
2334
2335 vec4_instruction *inst = NULL;
2336 switch (ir->op) {
2337 case ir_tex:
2338 case ir_txl:
2339 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2340 break;
2341 case ir_txd:
2342 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2343 break;
2344 case ir_txf:
2345 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2346 break;
2347 case ir_txf_ms:
2348 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2349 break;
2350 case ir_txs:
2351 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2352 break;
2353 case ir_tg4:
2354 if (has_nonconstant_offset)
2355 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2356 else
2357 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2358 break;
2359 case ir_query_levels:
2360 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2361 break;
2362 case ir_txb:
2363 assert(!"TXB is not valid for vertex shaders.");
2364 break;
2365 case ir_lod:
2366 assert(!"LOD is not valid for vertex shaders.");
2367 break;
2368 default:
2369 assert(!"Unrecognized tex op");
2370 }
2371
2372 if (ir->offset != NULL && ir->op != ir_txf)
2373 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2374
2375 /* Stuff the channel select bits in the top of the texture offset */
2376 if (ir->op == ir_tg4)
2377 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2378
2379 /* The message header is necessary for:
2380 * - Gen4 (always)
2381 * - Texel offsets
2382 * - Gather channel selection
2383 * - Sampler indices too large to fit in a 4-bit value.
2384 */
2385 inst->header_present =
2386 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2387 sampler >= 16;
2388 inst->base_mrf = 2;
2389 inst->mlen = inst->header_present + 1; /* always at least one */
2390 inst->sampler = sampler;
2391 inst->dst = dst_reg(this, ir->type);
2392 inst->dst.writemask = WRITEMASK_XYZW;
2393 inst->shadow_compare = ir->shadow_comparitor != NULL;
2394
2395 /* MRF for the first parameter */
2396 int param_base = inst->base_mrf + inst->header_present;
2397
2398 if (ir->op == ir_txs || ir->op == ir_query_levels) {
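      /* txs and query_levels take only an LOD; it goes in .w on Gen4 and
       * .x on Gen5+.
       */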
2399 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2400 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2401 } else {
2402 /* Load the coordinate */
2403 /* FINISHME: gl_clamp_mask and saturate */
2404 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2405 int zero_mask = 0xf & ~coord_mask;
2406
2407 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2408 coordinate));
2409
2410 if (zero_mask != 0) {
2411 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2412 src_reg(0)));
2413 }
2414       /* Load the shadow comparator */
2415 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2416 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2417 WRITEMASK_X),
2418 shadow_comparitor));
2419 inst->mlen++;
2420 }
2421
2422 /* Load the LOD info */
2423 if (ir->op == ir_tex || ir->op == ir_txl) {
2424 int mrf, writemask;
2425 if (brw->gen >= 5) {
2426 mrf = param_base + 1;
2427 if (ir->shadow_comparitor) {
2428 writemask = WRITEMASK_Y;
2429 /* mlen already incremented */
2430 } else {
2431 writemask = WRITEMASK_X;
2432 inst->mlen++;
2433 }
2434 } else /* brw->gen == 4 */ {
2435 mrf = param_base;
2436 writemask = WRITEMASK_W;
2437 }
2438 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2439 } else if (ir->op == ir_txf) {
2440 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2441 } else if (ir->op == ir_txf_ms) {
2442 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2443 sample_index));
2444          /* MCS data is in the first channel of `mcs`, but we need to get it into
2445           * the .y channel of the second vec4 of params, so replicate .x across
2446           * the whole vec4 and then mask off everything except .y.
2447           */
2448          if (brw->gen >= 7)
2449             mcs.swizzle = BRW_SWIZZLE_XXXX;
2450 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2451 mcs));
2452 inst->mlen++;
2453 } else if (ir->op == ir_txd) {
2454 const glsl_type *type = lod_type;
2455
2456 if (brw->gen >= 5) {
2457 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2458 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2459 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2460 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2461 inst->mlen++;
2462
2463 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2464 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2465 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2466 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2467 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2468 inst->mlen++;
2469
2470 if (ir->shadow_comparitor) {
2471 emit(MOV(dst_reg(MRF, param_base + 2,
2472 ir->shadow_comparitor->type, WRITEMASK_Z),
2473 shadow_comparitor));
2474 }
2475 }
2476 } else /* brw->gen == 4 */ {
2477 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2478 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2479 inst->mlen += 2;
2480 }
2481 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2482 if (ir->shadow_comparitor) {
2483 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2484 shadow_comparitor));
2485 }
2486
2487 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2488 offset_value));
2489 inst->mlen++;
2490 }
2491 }
2492
2493 emit(inst);
2494
2495    /* Fix up the number of layers (z) for cube arrays: the hardware returns
2496    * faces * layers, but the spec requires just layers.
2497    */
2498 if (ir->op == ir_txs) {
2499 glsl_type const *type = ir->sampler->type;
2500 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2501 type->sampler_array) {
2502 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2503 writemask(inst->dst, WRITEMASK_Z),
2504 src_reg(inst->dst), src_reg(6));
2505 }
2506 }
2507
2508 if (brw->gen == 6 && ir->op == ir_tg4) {
2509 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2510 }
2511
2512 swizzle_result(ir, src_reg(inst->dst), sampler);
2513 }
2514
2515 /**
2516 * Apply workarounds for Gen6 gather with UINT/SINT
2517 */
2518 void
2519 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2520 {
2521 if (!wa)
2522 return;
2523
2524 int width = (wa & WA_8BIT) ? 8 : 16;
2525 dst_reg dst_f = dst;
2526 dst_f.type = BRW_REGISTER_TYPE_F;
2527
2528 /* Convert from UNORM to UINT */
2529 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2530 emit(MOV(dst, src_reg(dst_f)));
2531
2532 if (wa & WA_SIGN) {
2533 /* Reinterpret the UINT value as a signed INT value by
2534 * shifting the sign bit into place, then shifting back
2535 * preserving sign.
2536 */
2537 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2538 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2539 }
2540 }
2541
2542 /**
2543 * Set up the gather channel based on the swizzle, for gather4.
2544 */
2545 uint32_t
2546 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2547 {
2548 ir_constant *chan = ir->lod_info.component->as_constant();
2549 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2550 switch (swiz) {
2551 case SWIZZLE_X: return 0;
2552 case SWIZZLE_Y:
2553 /* gather4 sampler is broken for green channel on RG32F --
2554 * we must ask for blue instead.
2555 */
2556 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2557 return 2;
2558 return 1;
2559 case SWIZZLE_Z: return 2;
2560 case SWIZZLE_W: return 3;
2561 default:
2562 assert(!"Not reached"); /* zero, one swizzles handled already */
2563 return 0;
2564 }
2565 }
2566
2567 void
2568 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2569 {
2570 int s = key->tex.swizzles[sampler];
2571
2572 this->result = src_reg(this, ir->type);
2573 dst_reg swizzled_result(this->result);
2574
2575 if (ir->op == ir_query_levels) {
2576 /* # levels is in .w */
2577 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2578 emit(MOV(swizzled_result, orig_val));
2579 return;
2580 }
2581
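   /* txs results, single-float results, identity swizzles, and gather (tg4)
    * results, whose channel select was applied earlier, are copied through
    * without swizzling.
    */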
2582 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2583 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2584 emit(MOV(swizzled_result, orig_val));
2585 return;
2586 }
2587
2588
2589 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2590 int swizzle[4] = {0};
2591
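   /* Classify each destination channel: copied from the texture result,
    * forced to zero, or forced to one.
    */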
2592 for (int i = 0; i < 4; i++) {
2593 switch (GET_SWZ(s, i)) {
2594 case SWIZZLE_ZERO:
2595 zero_mask |= (1 << i);
2596 break;
2597 case SWIZZLE_ONE:
2598 one_mask |= (1 << i);
2599 break;
2600 default:
2601 copy_mask |= (1 << i);
2602 swizzle[i] = GET_SWZ(s, i);
2603 break;
2604 }
2605 }
2606
2607 if (copy_mask) {
2608 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2609 swizzled_result.writemask = copy_mask;
2610 emit(MOV(swizzled_result, orig_val));
2611 }
2612
2613 if (zero_mask) {
2614 swizzled_result.writemask = zero_mask;
2615 emit(MOV(swizzled_result, src_reg(0.0f)));
2616 }
2617
2618 if (one_mask) {
2619 swizzled_result.writemask = one_mask;
2620 emit(MOV(swizzled_result, src_reg(1.0f)));
2621 }
2622 }
2623
2624 void
2625 vec4_visitor::visit(ir_return *ir)
2626 {
2627 assert(!"not reached");
2628 }
2629
2630 void
2631 vec4_visitor::visit(ir_discard *ir)
2632 {
2633 assert(!"not reached");
2634 }
2635
2636 void
2637 vec4_visitor::visit(ir_if *ir)
2638 {
2639    /* Don't point the annotation at the if statement, because then it, plus
2640    * the then and else blocks, all get printed.
2641    */
2642 this->base_ir = ir->condition;
2643
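   /* Gen6 has an IF instruction with an embedded comparison (emit_if_gen6);
    * other generations evaluate the condition into a predicate first and
    * emit a predicated IF.
    */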
2644 if (brw->gen == 6) {
2645 emit_if_gen6(ir);
2646 } else {
2647 uint32_t predicate;
2648 emit_bool_to_cond_code(ir->condition, &predicate);
2649 emit(IF(predicate));
2650 }
2651
2652 visit_instructions(&ir->then_instructions);
2653
2654 if (!ir->else_instructions.is_empty()) {
2655 this->base_ir = ir->condition;
2656 emit(BRW_OPCODE_ELSE);
2657
2658 visit_instructions(&ir->else_instructions);
2659 }
2660
2661 this->base_ir = ir->condition;
2662 emit(BRW_OPCODE_ENDIF);
2663 }
2664
2665 void
2666 vec4_visitor::visit(ir_emit_vertex *)
2667 {
2668 assert(!"not reached");
2669 }
2670
2671 void
2672 vec4_visitor::visit(ir_end_primitive *)
2673 {
2674 assert(!"not reached");
2675 }
2676
2677 void
2678 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2679 dst_reg dst, src_reg offset,
2680 src_reg src0, src_reg src1)
2681 {
2682 unsigned mlen = 0;
2683
2684 /* Set the atomic operation offset. */
2685 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2686 mlen++;
2687
2688 /* Set the atomic operation arguments. */
2689 if (src0.file != BAD_FILE) {
2690 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2691 mlen++;
2692 }
2693
2694 if (src1.file != BAD_FILE) {
2695 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2696 mlen++;
2697 }
2698
2699 /* Emit the instruction. Note that this maps to the normal SIMD8
2700 * untyped atomic message on Ivy Bridge, but that's OK because
2701 * unused channels will be masked out.
2702 */
2703 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2704 src_reg(atomic_op), src_reg(surf_index));
2705 inst->base_mrf = 0;
2706 inst->mlen = mlen;
2707 }
2708
2709 void
2710 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2711 src_reg offset)
2712 {
2713 /* Set the surface read offset. */
2714 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2715
2716 /* Emit the instruction. Note that this maps to the normal SIMD8
2717 * untyped surface read message, but that's OK because unused
2718 * channels will be masked out.
2719 */
2720 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2721 dst, src_reg(surf_index));
2722 inst->base_mrf = 0;
2723 inst->mlen = 1;
2724 }
2725
2726 void
2727 vec4_visitor::emit_ndc_computation()
2728 {
2729 /* Get the position */
2730 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2731
2732 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2733 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2734 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2735
2736 current_annotation = "NDC";
2737 dst_reg ndc_w = ndc;
2738 ndc_w.writemask = WRITEMASK_W;
2739 src_reg pos_w = pos;
2740 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2741 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2742
2743 dst_reg ndc_xyz = ndc;
2744 ndc_xyz.writemask = WRITEMASK_XYZ;
2745
2746 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2747 }
2748
2749 void
2750 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2751 {
2752 if (brw->gen < 6 &&
2753 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2754 key->userclip_active || brw->has_negative_rhw_bug)) {
2755 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2756 dst_reg header1_w = header1;
2757 header1_w.writemask = WRITEMASK_W;
2758
2759 emit(MOV(header1, 0u));
2760
2761 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2762 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2763
2764 current_annotation = "Point size";
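         /* Pack the point size into the header's 11-bit fixed-point
          * point-width field starting at bit 8.
          */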
2765 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2766 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2767 }
2768
2769 if (key->userclip_active) {
2770 current_annotation = "Clipping flags";
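         /* Compare each set of four clip distances against zero and OR the
          * resulting flag bits into the header; the second set is shifted up
          * by four bits.
          */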
2771 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2772 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2773
2774 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2775 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2776 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2777
2778 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2779 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2780 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2781 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2782 }
2783
2784 /* i965 clipping workaround:
2785 * 1) Test for -ve rhw
2786 * 2) If set,
2787 * set ndc = (0,0,0,0)
2788 * set ucp[6] = 1
2789 *
2790 * Later, clipping will detect ucp[6] and ensure the primitive is
2791 * clipped against all fixed planes.
2792 */
2793 if (brw->has_negative_rhw_bug) {
2794 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2795 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2796 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2797 vec4_instruction *inst;
2798 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2799 inst->predicate = BRW_PREDICATE_NORMAL;
2800 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2801 inst->predicate = BRW_PREDICATE_NORMAL;
2802 }
2803
2804 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2805 } else if (brw->gen < 6) {
2806 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2807 } else {
2808 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2809 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2810 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2811 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2812 }
2813 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2814 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2815 src_reg(output_reg[VARYING_SLOT_LAYER])));
2816 }
2817 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2818 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2819 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2820 }
2821 }
2822 }
2823
2824 void
2825 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2826 {
2827 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2828 *
2829 * "If a linked set of shaders forming the vertex stage contains no
2830 * static write to gl_ClipVertex or gl_ClipDistance, but the
2831 * application has requested clipping against user clip planes through
2832 * the API, then the coordinate written to gl_Position is used for
2833 * comparison against the user clip planes."
2834 *
2835 * This function is only called if the shader didn't write to
2836 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2837 * if the user wrote to it; otherwise we use gl_Position.
2838 */
2839 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2840 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2841 clip_vertex = VARYING_SLOT_POS;
2842 }
2843
2844 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2845 ++i) {
2846 reg.writemask = 1 << i;
2847 emit(DP4(reg,
2848 src_reg(output_reg[clip_vertex]),
2849 src_reg(this->userplane[i + offset])));
2850 }
2851 }
2852
2853 void
2854 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2855 {
2856 assert (varying < VARYING_SLOT_MAX);
2857 reg.type = output_reg[varying].type;
2858 current_annotation = output_reg_annotation[varying];
2859 /* Copy the register, saturating if necessary */
2860 vec4_instruction *inst = emit(MOV(reg,
2861 src_reg(output_reg[varying])));
2862 if ((varying == VARYING_SLOT_COL0 ||
2863 varying == VARYING_SLOT_COL1 ||
2864 varying == VARYING_SLOT_BFC0 ||
2865 varying == VARYING_SLOT_BFC1) &&
2866 key->clamp_vertex_color) {
2867 inst->saturate = true;
2868 }
2869 }
2870
2871 void
2872 vec4_visitor::emit_urb_slot(int mrf, int varying)
2873 {
2874 struct brw_reg hw_reg = brw_message_reg(mrf);
2875 dst_reg reg = dst_reg(MRF, mrf);
2876 reg.type = BRW_REGISTER_TYPE_F;
2877
2878 switch (varying) {
2879 case VARYING_SLOT_PSIZ:
2880 /* PSIZ is always in slot 0, and is coupled with other flags. */
2881 current_annotation = "indices, point width, clip flags";
2882 emit_psiz_and_flags(hw_reg);
2883 break;
2884 case BRW_VARYING_SLOT_NDC:
2885 current_annotation = "NDC";
2886 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2887 break;
2888 case VARYING_SLOT_POS:
2889 current_annotation = "gl_Position";
2890 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2891 break;
2892 case VARYING_SLOT_EDGE:
2893 /* This is present when doing unfilled polygons. We're supposed to copy
2894 * the edge flag from the user-provided vertex array
2895 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2896 * of that attribute (starts as 1.0f). This is then used in clipping to
2897 * determine which edges should be drawn as wireframe.
2898 */
2899 current_annotation = "edge flag";
2900 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2901 glsl_type::float_type, WRITEMASK_XYZW))));
2902 break;
2903 case BRW_VARYING_SLOT_PAD:
2904 /* No need to write to this slot */
2905 break;
2906 default:
2907 emit_generic_urb_slot(reg, varying);
2908 break;
2909 }
2910 }
2911
2912 static int
2913 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2914 {
2915 if (brw->gen >= 6) {
2916 /* URB data written (does not include the message header reg) must
2917 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2918 * section 5.4.3.2.2: URB_INTERLEAVED.
2919 *
2920 * URB entries are allocated on a multiple of 1024 bits, so an
2921 * extra 128 bits written here to make the end align to 256 is
2922 * no problem.
2923 */
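      /* mlen includes the message header register, so the data portion is
       * mlen - 1; requiring an even amount of data means mlen must be odd.
       */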
2924 if ((mlen % 2) != 1)
2925 mlen++;
2926 }
2927
2928 return mlen;
2929 }
2930
2931
2932 /**
2933 * Generates the VUE payload plus the necessary URB write instructions to
2934 * output it.
2935 *
2936 * The VUE layout is documented in Volume 2a.
2937 */
2938 void
2939 vec4_visitor::emit_vertex()
2940 {
2941 /* MRF 0 is reserved for the debugger, so start with message header
2942 * in MRF 1.
2943 */
2944 int base_mrf = 1;
2945 int mrf = base_mrf;
2946 /* In the process of generating our URB write message contents, we
2947 * may need to unspill a register or load from an array. Those
2948 * reads would use MRFs 14-15.
2949 */
2950 int max_usable_mrf = 13;
2951
2952 /* The following assertion verifies that max_usable_mrf causes an
2953 * even-numbered amount of URB write data, which will meet gen6's
2954 * requirements for length alignment.
2955 */
2956 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2957
2958 /* First mrf is the g0-based message header containing URB handles and
2959 * such.
2960 */
2961 emit_urb_write_header(mrf++);
2962
2963 if (brw->gen < 6) {
2964 emit_ndc_computation();
2965 }
2966
2967    /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
2968 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2969 current_annotation = "user clip distances";
2970
2971 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
2972 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
2973
2974 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
2975 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
2976 }
2977
2978 /* We may need to split this up into several URB writes, so do them in a
2979 * loop.
2980 */
2981 int slot = 0;
2982 bool complete = false;
2983 do {
2984 /* URB offset is in URB row increments, and each of our MRFs is half of
2985 * one of those, since we're doing interleaved writes.
2986 */
2987 int offset = slot / 2;
2988
2989 mrf = base_mrf + 1;
2990 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2991 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2992
2993 /* If this was max_usable_mrf, we can't fit anything more into this
2994 * URB WRITE.
2995 */
2996 if (mrf > max_usable_mrf) {
2997 slot++;
2998 break;
2999 }
3000 }
3001
3002 complete = slot >= prog_data->vue_map.num_slots;
3003 current_annotation = "URB write";
3004 vec4_instruction *inst = emit_urb_write_opcode(complete);
3005 inst->base_mrf = base_mrf;
3006 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3007 inst->offset += offset;
3008 } while(!complete);
3009 }
3010
3011
3012 src_reg
3013 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3014 src_reg *reladdr, int reg_offset)
3015 {
3016 /* Because we store the values to scratch interleaved like our
3017 * vertex data, we need to scale the vec4 index by 2.
3018 */
3019 int message_header_scale = 2;
3020
3021 /* Pre-gen6, the message header uses byte offsets instead of vec4
3022 * (16-byte) offset units.
3023 */
3024 if (brw->gen < 6)
3025 message_header_scale *= 16;
3026
3027 if (reladdr) {
3028 src_reg index = src_reg(this, glsl_type::int_type);
3029
3030 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3031 emit_before(inst, MUL(dst_reg(index),
3032 index, src_reg(message_header_scale)));
3033
3034 return index;
3035 } else {
3036 return src_reg(reg_offset * message_header_scale);
3037 }
3038 }
3039
3040 src_reg
3041 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3042 src_reg *reladdr, int reg_offset)
3043 {
3044 if (reladdr) {
3045 src_reg index = src_reg(this, glsl_type::int_type);
3046
3047 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3048
3049 /* Pre-gen6, the message header uses byte offsets instead of vec4
3050 * (16-byte) offset units.
3051 */
3052 if (brw->gen < 6) {
3053 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3054 }
3055
3056 return index;
3057 } else if (brw->gen >= 8) {
3058 /* Store the offset in a GRF so we can send-from-GRF. */
3059 src_reg offset = src_reg(this, glsl_type::int_type);
3060 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3061 return offset;
3062 } else {
3063 int message_header_scale = brw->gen < 6 ? 16 : 1;
3064 return src_reg(reg_offset * message_header_scale);
3065 }
3066 }
3067
3068 /**
3069 * Emits an instruction before @inst to load the value named by @orig_src
3070 * from scratch space at @base_offset to @temp.
3071 *
3072 * @base_offset is measured in 32-byte units (the size of a register).
3073 */
3074 void
3075 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3076 dst_reg temp, src_reg orig_src,
3077 int base_offset)
3078 {
3079 int reg_offset = base_offset + orig_src.reg_offset;
3080 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3081
3082 emit_before(inst, SCRATCH_READ(temp, index));
3083 }
3084
3085 /**
3086 * Emits an instruction after @inst to store the value to be written
3087 * to @orig_dst to scratch space at @base_offset, from @temp.
3088 *
3089 * @base_offset is measured in 32-byte units (the size of a register).
3090 */
3091 void
3092 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3093 {
3094 int reg_offset = base_offset + inst->dst.reg_offset;
3095 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3096
3097 /* Create a temporary register to store *inst's result in.
3098 *
3099 * We have to be careful in MOVing from our temporary result register in
3100 * the scratch write. If we swizzle from channels of the temporary that
3101 * weren't initialized, it will confuse live interval analysis, which will
3102 * make spilling fail to make progress.
3103 */
3104 src_reg temp = src_reg(this, glsl_type::vec4_type);
3105 temp.type = inst->dst.type;
3106 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3107 int swizzles[4];
3108 for (int i = 0; i < 4; i++)
3109 if (inst->dst.writemask & (1 << i))
3110 swizzles[i] = i;
3111 else
3112 swizzles[i] = first_writemask_chan;
3113 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3114 swizzles[2], swizzles[3]);
3115
3116 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3117 inst->dst.writemask));
3118 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3119 write->predicate = inst->predicate;
3120 write->ir = inst->ir;
3121 write->annotation = inst->annotation;
3122 inst->insert_after(write);
3123
3124 inst->dst.file = temp.file;
3125 inst->dst.reg = temp.reg;
3126 inst->dst.reg_offset = temp.reg_offset;
3127 inst->dst.reladdr = NULL;
3128 }
3129
3130 /**
3131 * We can't generally support array access in GRF space, because a
3132 * single instruction's destination can only span 2 contiguous
3133 * registers. So, we send all GRF arrays that get variable index
3134 * access to scratch space.
3135 */
3136 void
3137 vec4_visitor::move_grf_array_access_to_scratch()
3138 {
3139 int scratch_loc[this->virtual_grf_count];
3140
3141 for (int i = 0; i < this->virtual_grf_count; i++) {
3142 scratch_loc[i] = -1;
3143 }
3144
3145 /* First, calculate the set of virtual GRFs that need to be punted
3146 * to scratch due to having any array access on them, and where in
3147 * scratch.
3148 */
3149 foreach_list(node, &this->instructions) {
3150 vec4_instruction *inst = (vec4_instruction *)node;
3151
3152 if (inst->dst.file == GRF && inst->dst.reladdr &&
3153 scratch_loc[inst->dst.reg] == -1) {
3154 scratch_loc[inst->dst.reg] = c->last_scratch;
3155 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3156 }
3157
3158 for (int i = 0 ; i < 3; i++) {
3159 src_reg *src = &inst->src[i];
3160
3161 if (src->file == GRF && src->reladdr &&
3162 scratch_loc[src->reg] == -1) {
3163 scratch_loc[src->reg] = c->last_scratch;
3164 c->last_scratch += this->virtual_grf_sizes[src->reg];
3165 }
3166 }
3167 }
3168
3169 /* Now, for anything that will be accessed through scratch, rewrite
3170 * it to load/store. Note that this is a _safe list walk, because
3171 * we may generate a new scratch_write instruction after the one
3172 * we're processing.
3173 */
3174 foreach_list_safe(node, &this->instructions) {
3175 vec4_instruction *inst = (vec4_instruction *)node;
3176
3177       /* Set up the annotation tracking for newly generated instructions. */
3178 base_ir = inst->ir;
3179 current_annotation = inst->annotation;
3180
3181 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3182 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3183 }
3184
3185 for (int i = 0 ; i < 3; i++) {
3186 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3187 continue;
3188
3189 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3190
3191 emit_scratch_read(inst, temp, inst->src[i],
3192 scratch_loc[inst->src[i].reg]);
3193
3194 inst->src[i].file = temp.file;
3195 inst->src[i].reg = temp.reg;
3196 inst->src[i].reg_offset = temp.reg_offset;
3197 inst->src[i].reladdr = NULL;
3198 }
3199 }
3200 }
3201
3202 /**
3203 * Emits an instruction before @inst to load the value named by @orig_src
3204 * from the pull constant buffer (surface) at @base_offset to @temp.
3205 */
3206 void
3207 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3208 dst_reg temp, src_reg orig_src,
3209 int base_offset)
3210 {
3211 int reg_offset = base_offset + orig_src.reg_offset;
3212 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3213 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3214 vec4_instruction *load;
3215
3216 if (brw->gen >= 7) {
3217 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3218 grf_offset.type = offset.type;
3219 emit_before(inst, MOV(grf_offset, offset));
3220
3221 load = new(mem_ctx) vec4_instruction(this,
3222 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3223 temp, index, src_reg(grf_offset));
3224 } else {
3225 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3226 temp, index, offset);
3227 load->base_mrf = 14;
3228 load->mlen = 1;
3229 }
3230 emit_before(inst, load);
3231 }
3232
3233 /**
3234 * Implements array access of uniforms by inserting a
3235 * PULL_CONSTANT_LOAD instruction.
3236 *
3237 * Unlike temporary GRF array access (where we don't support it due to
3238 * the difficulty of doing relative addressing on instruction
3239 * destinations), we could potentially do array access of uniforms
3240 * that were loaded in GRF space as push constants. In real-world
3241 * usage we've seen, though, the arrays being used are always larger
3242 * than we could load as push constants, so just always move all
3243 * uniform array access out to a pull constant buffer.
3244 */
3245 void
3246 vec4_visitor::move_uniform_array_access_to_pull_constants()
3247 {
3248 int pull_constant_loc[this->uniforms];
3249
3250 for (int i = 0; i < this->uniforms; i++) {
3251 pull_constant_loc[i] = -1;
3252 }
3253
3254 /* Walk through and find array access of uniforms. Put a copy of that
3255 * uniform in the pull constant buffer.
3256 *
3257 * Note that we don't move constant-indexed accesses to arrays. No
3258 * testing has been done of the performance impact of this choice.
3259 */
3260 foreach_list_safe(node, &this->instructions) {
3261 vec4_instruction *inst = (vec4_instruction *)node;
3262
3263 for (int i = 0 ; i < 3; i++) {
3264 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3265 continue;
3266
3267 int uniform = inst->src[i].reg;
3268
3269 /* If this array isn't already present in the pull constant buffer,
3270 * add it.
3271 */
3272 if (pull_constant_loc[uniform] == -1) {
3273 const float **values = &stage_prog_data->param[uniform * 4];
3274
3275 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3276
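            /* Append every component of the array (uniform_size[uniform]
             * vec4s) to the pull parameter list.
             */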
3277 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3278 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3279 = values[j];
3280 }
3281 }
3282
3283          /* Set up the annotation tracking for newly generated instructions. */
3284 base_ir = inst->ir;
3285 current_annotation = inst->annotation;
3286
3287 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3288
3289 emit_pull_constant_load(inst, temp, inst->src[i],
3290 pull_constant_loc[uniform]);
3291
3292 inst->src[i].file = temp.file;
3293 inst->src[i].reg = temp.reg;
3294 inst->src[i].reg_offset = temp.reg_offset;
3295 inst->src[i].reladdr = NULL;
3296 }
3297 }
3298
3299 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3300 * no need to track them as larger-than-vec4 objects. This will be
3301 * relied on in cutting out unused uniform vectors from push
3302 * constants.
3303 */
3304 split_uniform_registers();
3305 }
3306
3307 void
3308 vec4_visitor::resolve_ud_negate(src_reg *reg)
3309 {
3310 if (reg->type != BRW_REGISTER_TYPE_UD ||
3311 !reg->negate)
3312 return;
3313
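   /* Apply the negation with an explicit MOV into an unsigned temporary and
    * then use the temporary, with no source modifier, in its place.
    */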
3314 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3315 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3316 *reg = temp;
3317 }
3318
3319 vec4_visitor::vec4_visitor(struct brw_context *brw,
3320 struct brw_vec4_compile *c,
3321 struct gl_program *prog,
3322 const struct brw_vec4_prog_key *key,
3323 struct brw_vec4_prog_data *prog_data,
3324 struct gl_shader_program *shader_prog,
3325 struct brw_shader *shader,
3326 void *mem_ctx,
3327 bool debug_flag,
3328 bool no_spills,
3329 shader_time_shader_type st_base,
3330 shader_time_shader_type st_written,
3331 shader_time_shader_type st_reset)
3332 : sanity_param_count(0),
3333 fail_msg(NULL),
3334 first_non_payload_grf(0),
3335 need_all_constants_in_pull_buffer(false),
3336 debug_flag(debug_flag),
3337 no_spills(no_spills),
3338 st_base(st_base),
3339 st_written(st_written),
3340 st_reset(st_reset)
3341 {
3342 this->brw = brw;
3343 this->ctx = &brw->ctx;
3344 this->shader_prog = shader_prog;
3345 this->shader = shader;
3346
3347 this->mem_ctx = mem_ctx;
3348 this->failed = false;
3349
3350 this->base_ir = NULL;
3351 this->current_annotation = NULL;
3352 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3353
3354 this->c = c;
3355 this->prog = prog;
3356 this->key = key;
3357 this->prog_data = prog_data;
3358 this->stage_prog_data = &prog_data->base;
3359
3360 this->variable_ht = hash_table_ctor(0,
3361 hash_table_pointer_hash,
3362 hash_table_pointer_compare);
3363
3364 this->virtual_grf_start = NULL;
3365 this->virtual_grf_end = NULL;
3366 this->virtual_grf_sizes = NULL;
3367 this->virtual_grf_count = 0;
3368 this->virtual_grf_reg_map = NULL;
3369 this->virtual_grf_reg_count = 0;
3370 this->virtual_grf_array_size = 0;
3371 this->live_intervals_valid = false;
3372
3373 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3374
3375 this->uniforms = 0;
3376 }
3377
3378 vec4_visitor::~vec4_visitor()
3379 {
3380 hash_table_dtor(this->variable_ht);
3381 }
3382
3383
3384 void
3385 vec4_visitor::fail(const char *format, ...)
3386 {
3387 va_list va;
3388 char *msg;
3389
3390 if (failed)
3391 return;
3392
3393 failed = true;
3394
3395 va_start(va, format);
3396 msg = ralloc_vasprintf(mem_ctx, format, va);
3397 va_end(va);
3398 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3399
3400 this->fail_msg = msg;
3401
3402 if (debug_flag) {
3403 fprintf(stderr, "%s", msg);
3404 }
3405 }
3406
3407 } /* namespace brw */