i965: For color clears, only disable writes to components that exist.
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->conditional_mod = BRW_CONDITIONAL_NONE;
46 this->sampler = 0;
47 this->texture_offset = 0;
48 this->target = 0;
49 this->shadow_compare = false;
50 this->ir = v->base_ir;
51 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
52 this->header_present = false;
53 this->mlen = 0;
54 this->base_mrf = 0;
55 this->offset = 0;
56 this->annotation = v->current_annotation;
57 }
58
59 vec4_instruction *
60 vec4_visitor::emit(vec4_instruction *inst)
61 {
62 this->instructions.push_tail(inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
69 {
70 new_inst->ir = inst->ir;
71 new_inst->annotation = inst->annotation;
72
73 inst->insert_before(new_inst);
74
75 return inst;
76 }
77
78 vec4_instruction *
79 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
80 src_reg src0, src_reg src1, src_reg src2)
81 {
82 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
83 src0, src1, src2));
84 }
85
86
87 vec4_instruction *
88 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
89 {
90 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
91 }
92
93 vec4_instruction *
94 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
95 {
96 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
97 }
98
99 vec4_instruction *
100 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
101 {
102 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
103 }
104
105 vec4_instruction *
106 vec4_visitor::emit(enum opcode opcode)
107 {
108 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
109 }
110
111 #define ALU1(op) \
112 vec4_instruction * \
113 vec4_visitor::op(dst_reg dst, src_reg src0) \
114 { \
115 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
116 src0); \
117 }
118
119 #define ALU2(op) \
120 vec4_instruction * \
121 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
122 { \
123 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
124 src0, src1); \
125 }
126
127 #define ALU3(op) \
128 vec4_instruction * \
129 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
130 { \
131 assert(brw->gen >= 6); \
132 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
133 src0, src1, src2); \
134 }
135
136 ALU1(NOT)
137 ALU1(MOV)
138 ALU1(FRC)
139 ALU1(RNDD)
140 ALU1(RNDE)
141 ALU1(RNDZ)
142 ALU1(F32TO16)
143 ALU1(F16TO32)
144 ALU2(ADD)
145 ALU2(MUL)
146 ALU2(MACH)
147 ALU2(AND)
148 ALU2(OR)
149 ALU2(XOR)
150 ALU2(DP3)
151 ALU2(DP4)
152 ALU2(DPH)
153 ALU2(SHL)
154 ALU2(SHR)
155 ALU2(ASR)
156 ALU3(LRP)
157 ALU1(BFREV)
158 ALU3(BFE)
159 ALU2(BFI1)
160 ALU3(BFI2)
161 ALU1(FBH)
162 ALU1(FBL)
163 ALU1(CBIT)
164 ALU3(MAD)
165 ALU2(ADDC)
166 ALU2(SUBB)
167
168 /** Gen4 predicated IF. */
169 vec4_instruction *
170 vec4_visitor::IF(uint32_t predicate)
171 {
172 vec4_instruction *inst;
173
174 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
175 inst->predicate = predicate;
176
177 return inst;
178 }
179
180 /** Gen6 IF with embedded comparison. */
181 vec4_instruction *
182 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
183 {
184 assert(brw->gen == 6);
185
186 vec4_instruction *inst;
187
188 resolve_ud_negate(&src0);
189 resolve_ud_negate(&src1);
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
192 src0, src1);
193 inst->conditional_mod = condition;
194
195 return inst;
196 }
197
198 /**
199 * CMP: Sets the low bit of the destination channels with the result
200 * of the comparison, while the upper bits are undefined, and updates
201 * the flag register with the packed 16 bits of the result.
202 */
203 vec4_instruction *
204 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
205 {
206 vec4_instruction *inst;
207
208 /* original gen4 does type conversion to the destination type
209 * before comparison, producing garbage results for floating
210 * point comparisons.
211 */
212 if (brw->gen == 4) {
213 dst.type = src0.type;
214 if (dst.file == HW_REG)
215 dst.fixed_hw_reg.type = dst.type;
216 }
217
218 resolve_ud_negate(&src0);
219 resolve_ud_negate(&src1);
220
221 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
222 inst->conditional_mod = condition;
223
224 return inst;
225 }
226
227 vec4_instruction *
228 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
229 {
230 vec4_instruction *inst;
231
232 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
233 dst, index);
234 inst->base_mrf = 14;
235 inst->mlen = 2;
236
237 return inst;
238 }
239
240 vec4_instruction *
241 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
242 {
243 vec4_instruction *inst;
244
245 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
246 dst, src, index);
247 inst->base_mrf = 13;
248 inst->mlen = 3;
249
250 return inst;
251 }
252
253 void
254 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
255 {
256 static enum opcode dot_opcodes[] = {
257 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
258 };
259
260 emit(dot_opcodes[elements - 2], dst, src0, src1);
261 }
262
263 src_reg
264 vec4_visitor::fix_3src_operand(src_reg src)
265 {
266 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
267 * able to use vertical stride of zero to replicate the vec4 uniform, like
268 *
269 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
270 *
271 * But you can't, since vertical stride is always four in three-source
272 * instructions. Instead, insert a MOV instruction to do the replication so
273 * that the three-source instruction can consume it.
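 *
 * Illustrative sketch (register names are made up): a MAD with a vec4
 * uniform operand becomes
 *
 *    MOV tmp, u0
 *    MAD dst, src0, tmp, src2
 *
 * so the three-source instruction reads the replicated value from a GRF.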
274 */
275
276 /* The MOV is only needed if the source is a uniform or immediate. */
277 if (src.file != UNIFORM && src.file != IMM)
278 return src;
279
280 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
281 return src;
282
283 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
284 expanded.type = src.type;
285 emit(MOV(expanded, src));
286 return src_reg(expanded);
287 }
288
289 src_reg
290 vec4_visitor::fix_math_operand(src_reg src)
291 {
292 /* The gen6 math instruction ignores the source modifiers --
293 * swizzle, abs, negate, and at least some parts of the register
294 * region description.
295 *
296 * Rather than trying to enumerate all these cases, *always* expand the
297 * operand to a temp GRF for gen6.
298 *
299 * For gen7, keep the operand as-is, except if immediate, which gen7 still
300 * can't use.
301 */
302
303 if (brw->gen == 7 && src.file != IMM)
304 return src;
305
306 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
307 expanded.type = src.type;
308 emit(MOV(expanded, src));
309 return src_reg(expanded);
310 }
311
312 void
313 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
314 {
315 src = fix_math_operand(src);
316
317 if (dst.writemask != WRITEMASK_XYZW) {
318 /* The gen6 math instruction must be align1, so we can't do
319 * writemasks.
320 */
321 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
322
323 emit(opcode, temp_dst, src);
324
325 emit(MOV(dst, src_reg(temp_dst)));
326 } else {
327 emit(opcode, dst, src);
328 }
329 }
330
331 void
332 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
333 {
334 vec4_instruction *inst = emit(opcode, dst, src);
335 inst->base_mrf = 1;
336 inst->mlen = 1;
337 }
338
339 void
340 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
341 {
342 switch (opcode) {
343 case SHADER_OPCODE_RCP:
344 case SHADER_OPCODE_RSQ:
345 case SHADER_OPCODE_SQRT:
346 case SHADER_OPCODE_EXP2:
347 case SHADER_OPCODE_LOG2:
348 case SHADER_OPCODE_SIN:
349 case SHADER_OPCODE_COS:
350 break;
351 default:
352 assert(!"not reached: bad math opcode");
353 return;
354 }
355
356 if (brw->gen >= 6) {
357 return emit_math1_gen6(opcode, dst, src);
358 } else {
359 return emit_math1_gen4(opcode, dst, src);
360 }
361 }
362
363 void
364 vec4_visitor::emit_math2_gen6(enum opcode opcode,
365 dst_reg dst, src_reg src0, src_reg src1)
366 {
367 src0 = fix_math_operand(src0);
368 src1 = fix_math_operand(src1);
369
370 if (dst.writemask != WRITEMASK_XYZW) {
371 /* The gen6 math instruction must be align1, so we can't do
372 * writemasks.
373 */
374 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
375 temp_dst.type = dst.type;
376
377 emit(opcode, temp_dst, src0, src1);
378
379 emit(MOV(dst, src_reg(temp_dst)));
380 } else {
381 emit(opcode, dst, src0, src1);
382 }
383 }
384
385 void
386 vec4_visitor::emit_math2_gen4(enum opcode opcode,
387 dst_reg dst, src_reg src0, src_reg src1)
388 {
389 vec4_instruction *inst = emit(opcode, dst, src0, src1);
390 inst->base_mrf = 1;
391 inst->mlen = 2;
392 }
393
394 void
395 vec4_visitor::emit_math(enum opcode opcode,
396 dst_reg dst, src_reg src0, src_reg src1)
397 {
398 switch (opcode) {
399 case SHADER_OPCODE_POW:
400 case SHADER_OPCODE_INT_QUOTIENT:
401 case SHADER_OPCODE_INT_REMAINDER:
402 break;
403 default:
404 assert(!"not reached: unsupported binary math opcode");
405 return;
406 }
407
408 if (brw->gen >= 6) {
409 return emit_math2_gen6(opcode, dst, src0, src1);
410 } else {
411 return emit_math2_gen4(opcode, dst, src0, src1);
412 }
413 }
414
415 void
416 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
417 {
418 if (brw->gen < 7)
419 assert(!"ir_unop_pack_half_2x16 should be lowered");
420
421 assert(dst.type == BRW_REGISTER_TYPE_UD);
422 assert(src0.type == BRW_REGISTER_TYPE_F);
423
424 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
425 *
426 * Because this instruction does not have a 16-bit floating-point type,
427 * the destination data type must be Word (W).
428 *
429 * The destination must be DWord-aligned and specify a horizontal stride
430 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
431 * each destination channel and the upper word is not modified.
432 *
433 * The above restriction implies that the f32to16 instruction must use
434 * align1 mode, because only in align1 mode is it possible to specify
435 * horizontal stride. We choose here to defy the hardware docs and emit
436 * align16 instructions.
437 *
438 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
439 * instructions. I was partially successful in that the code passed all
440 * tests. However, the code was dubiously correct and fragile, and the
441 * tests were not harsh enough to probe that frailty. Not trusting the
442 * code, I chose instead to remain in align16 mode in defiance of the hw
443 * docs).
444 *
445 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
446 * simulator, emitting a f32to16 in align16 mode with UD as destination
447 * data type is safe. The behavior differs from that specified in the PRM
448 * in that the upper word of each destination channel is cleared to 0.
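 *
 * Illustrative example: packHalf2x16(vec2(1.0, -2.0)) should produce
 * 0xc0003c00, since 1.0 and -2.0 are 0x3c00 and 0xc000 in half precision.
 * The f32to16 below leaves 0x00003c00 in tmp.x and 0x0000c000 in tmp.y,
 * and the SHL/OR pair combines them into 0xc0003c00.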
449 */
450
451 dst_reg tmp_dst(this, glsl_type::uvec2_type);
452 src_reg tmp_src(tmp_dst);
453
454 #if 0
455 /* Verify the undocumented behavior on which the following instructions
456 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
457 * then the result of the bit-or instruction below will be incorrect.
458 *
459 * You should inspect the disasm output in order to verify that the MOV is
460 * not optimized away.
461 */
462 emit(MOV(tmp_dst, src_reg(0x12345678u)));
463 #endif
464
465 /* Give tmp the form below, where "." means untouched.
466 *
467 * w z y x w z y x
468 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
469 *
470 * That the upper word of each write-channel be 0 is required for the
471 * following bit-shift and bit-or instructions to work. Note that this
472 * relies on the undocumented hardware behavior mentioned above.
473 */
474 tmp_dst.writemask = WRITEMASK_XY;
475 emit(F32TO16(tmp_dst, src0));
476
477 /* Give the write-channels of dst the form:
478 * 0xhhhh0000
479 */
480 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
481 emit(SHL(dst, tmp_src, src_reg(16u)));
482
483 /* Finally, give the write-channels of dst the form of packHalf2x16's
484 * output:
485 * 0xhhhhllll
486 */
487 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
488 emit(OR(dst, src_reg(dst), tmp_src));
489 }
490
491 void
492 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
493 {
494 if (brw->gen < 7)
495 assert(!"ir_unop_unpack_half_2x16 should be lowered");
496
497 assert(dst.type == BRW_REGISTER_TYPE_F);
498 assert(src0.type == BRW_REGISTER_TYPE_UD);
499
500 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
501 *
502 * Because this instruction does not have a 16-bit floating-point type,
503 * the source data type must be Word (W). The destination type must be
504 * F (Float).
505 *
506 * To use W as the source data type, we must adjust horizontal strides,
507 * which is only possible in align1 mode. All my [chadv] attempts at
508 * emitting align1 instructions for unpackHalf2x16 failed to pass the
509 * Piglit tests, so I gave up.
510 *
511 * I've verified that, on gen7 hardware and the simulator, it is safe to
512 * emit f16to32 in align16 mode with UD as source data type.
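 *
 * Illustrative example: for src0 = 0xc0003c00, the AND/SHR below leave
 * 0x00003c00 in tmp.x and 0x0000c000 in tmp.y, and the f16to32 expands
 * those to dst.xy = (1.0, -2.0), matching unpackHalf2x16().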
513 */
514
515 dst_reg tmp_dst(this, glsl_type::uvec2_type);
516 src_reg tmp_src(tmp_dst);
517
518 tmp_dst.writemask = WRITEMASK_X;
519 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
520
521 tmp_dst.writemask = WRITEMASK_Y;
522 emit(SHR(tmp_dst, src0, src_reg(16u)));
523
524 dst.writemask = WRITEMASK_XY;
525 emit(F16TO32(dst, tmp_src));
526 }
527
528 void
529 vec4_visitor::visit_instructions(const exec_list *list)
530 {
531 foreach_list(node, list) {
532 ir_instruction *ir = (ir_instruction *)node;
533
534 base_ir = ir;
535 ir->accept(this);
536 }
537 }
538
539
540 static int
541 type_size(const struct glsl_type *type)
542 {
543 unsigned int i;
544 int size;
545
546 switch (type->base_type) {
547 case GLSL_TYPE_UINT:
548 case GLSL_TYPE_INT:
549 case GLSL_TYPE_FLOAT:
550 case GLSL_TYPE_BOOL:
551 if (type->is_matrix()) {
552 return type->matrix_columns;
553 } else {
554 /* Regardless of the size of the vector, it gets a vec4. This is bad
555 * packing for things like floats, but otherwise arrays become a
556 * mess. Hopefully a later pass over the code can pack scalars
557 * down if appropriate.
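 *
 * For example, a float and a vec4 each take one vec4 slot here, while
 * a mat3 (handled above) takes 3 slots and a float[4] array (handled
 * below) takes 4.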
558 */
559 return 1;
560 }
561 case GLSL_TYPE_ARRAY:
562 assert(type->length > 0);
563 return type_size(type->fields.array) * type->length;
564 case GLSL_TYPE_STRUCT:
565 size = 0;
566 for (i = 0; i < type->length; i++) {
567 size += type_size(type->fields.structure[i].type);
568 }
569 return size;
570 case GLSL_TYPE_SAMPLER:
571 /* Samplers take up one slot in UNIFORMS[], but they're baked in
572 * at link time.
573 */
574 return 1;
575 case GLSL_TYPE_ATOMIC_UINT:
576 return 0;
577 case GLSL_TYPE_IMAGE:
578 case GLSL_TYPE_VOID:
579 case GLSL_TYPE_ERROR:
580 case GLSL_TYPE_INTERFACE:
581 assert(0);
582 break;
583 }
584
585 return 0;
586 }
587
588 int
589 vec4_visitor::virtual_grf_alloc(int size)
590 {
591 if (virtual_grf_array_size <= virtual_grf_count) {
592 if (virtual_grf_array_size == 0)
593 virtual_grf_array_size = 16;
594 else
595 virtual_grf_array_size *= 2;
596 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
597 virtual_grf_array_size);
598 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
599 virtual_grf_array_size);
600 }
601 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
602 virtual_grf_reg_count += size;
603 virtual_grf_sizes[virtual_grf_count] = size;
604 return virtual_grf_count++;
605 }
606
607 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
608 {
609 init();
610
611 this->file = GRF;
612 this->reg = v->virtual_grf_alloc(type_size(type));
613
614 if (type->is_array() || type->is_record()) {
615 this->swizzle = BRW_SWIZZLE_NOOP;
616 } else {
617 this->swizzle = swizzle_for_size(type->vector_elements);
618 }
619
620 this->type = brw_type_for_base_type(type);
621 }
622
623 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
624 {
625 init();
626
627 this->file = GRF;
628 this->reg = v->virtual_grf_alloc(type_size(type));
629
630 if (type->is_array() || type->is_record()) {
631 this->writemask = WRITEMASK_XYZW;
632 } else {
633 this->writemask = (1 << type->vector_elements) - 1;
634 }
635
636 this->type = brw_type_for_base_type(type);
637 }
638
639 /* Our support for uniforms is piggy-backed on the struct
640 * gl_fragment_program, because that's where the values actually
641 * get stored, rather than in some global gl_shader_program uniform
642 * store.
643 */
644 void
645 vec4_visitor::setup_uniform_values(ir_variable *ir)
646 {
647 int namelen = strlen(ir->name);
648
649 /* The data for our (non-builtin) uniforms is stored in a series of
650 * gl_uniform_driver_storage structs for each subcomponent that
651 * glGetUniformLocation() could name. We know it's been set up in the same
652 * order we'd walk the type, so walk the list of storage and find anything
653 * with our name, or a subcomponent whose name starts with our name.
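 *
 * Illustrative example: for "uniform mat2 m[2]", the matching storage
 * entry covers MAX2(2, 1) array elements * 2 matrix columns = 4 vec4
 * slots; each slot below records a vector_size of 2 and points the
 * unused z/w channels at zero.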
654 */
655 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
656 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
657
658 if (strncmp(ir->name, storage->name, namelen) != 0 ||
659 (storage->name[namelen] != 0 &&
660 storage->name[namelen] != '.' &&
661 storage->name[namelen] != '[')) {
662 continue;
663 }
664
665 gl_constant_value *components = storage->storage;
666 unsigned vector_count = (MAX2(storage->array_elements, 1) *
667 storage->type->matrix_columns);
668
669 for (unsigned s = 0; s < vector_count; s++) {
670 assert(uniforms < uniform_array_size);
671 uniform_vector_size[uniforms] = storage->type->vector_elements;
672
673 int i;
674 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
675 stage_prog_data->param[uniforms * 4 + i] = &components->f;
676 components++;
677 }
678 for (; i < 4; i++) {
679 static float zero = 0;
680 stage_prog_data->param[uniforms * 4 + i] = &zero;
681 }
682
683 uniforms++;
684 }
685 }
686 }
687
688 void
689 vec4_visitor::setup_uniform_clipplane_values()
690 {
691 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
692
693 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
694 assert(this->uniforms < uniform_array_size);
695 this->uniform_vector_size[this->uniforms] = 4;
696 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
697 this->userplane[i].type = BRW_REGISTER_TYPE_F;
698 for (int j = 0; j < 4; ++j) {
699 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
700 }
701 ++this->uniforms;
702 }
703 }
704
705 /* Our support for builtin uniforms is even scarier than non-builtin.
706 * It sits on top of the PROG_STATE_VAR parameters that are
707 * automatically updated from GL context state.
708 */
709 void
710 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
711 {
712 const ir_state_slot *const slots = ir->state_slots;
713 assert(ir->state_slots != NULL);
714
715 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
716 /* This state reference has already been setup by ir_to_mesa,
717 * but we'll get the same index back here. We can reference
718 * ParameterValues directly, since unlike brw_fs.cpp, we never
719 * add new state references during compile.
720 */
721 int index = _mesa_add_state_reference(this->prog->Parameters,
722 (gl_state_index *)slots[i].tokens);
723 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
724
725 assert(this->uniforms < uniform_array_size);
726 this->uniform_vector_size[this->uniforms] = 0;
727 /* Add each of the unique swizzled channels of the element.
728 * This will end up matching the size of the glsl_type of this field.
729 */
730 int last_swiz = -1;
731 for (unsigned int j = 0; j < 4; j++) {
732 int swiz = GET_SWZ(slots[i].swizzle, j);
733 last_swiz = swiz;
734
735 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
736 assert(this->uniforms < uniform_array_size);
737 if (swiz <= last_swiz)
738 this->uniform_vector_size[this->uniforms]++;
739 }
740 this->uniforms++;
741 }
742 }
743
744 dst_reg *
745 vec4_visitor::variable_storage(ir_variable *var)
746 {
747 return (dst_reg *)hash_table_find(this->variable_ht, var);
748 }
749
750 void
751 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
752 {
753 ir_expression *expr = ir->as_expression();
754
755 *predicate = BRW_PREDICATE_NORMAL;
756
757 if (expr) {
758 src_reg op[2];
759 vec4_instruction *inst;
760
761 assert(expr->get_num_operands() <= 2);
762 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
763 expr->operands[i]->accept(this);
764 op[i] = this->result;
765
766 resolve_ud_negate(&op[i]);
767 }
768
769 switch (expr->operation) {
770 case ir_unop_logic_not:
771 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
772 inst->conditional_mod = BRW_CONDITIONAL_Z;
773 break;
774
775 case ir_binop_logic_xor:
776 inst = emit(XOR(dst_null_d(), op[0], op[1]));
777 inst->conditional_mod = BRW_CONDITIONAL_NZ;
778 break;
779
780 case ir_binop_logic_or:
781 inst = emit(OR(dst_null_d(), op[0], op[1]));
782 inst->conditional_mod = BRW_CONDITIONAL_NZ;
783 break;
784
785 case ir_binop_logic_and:
786 inst = emit(AND(dst_null_d(), op[0], op[1]));
787 inst->conditional_mod = BRW_CONDITIONAL_NZ;
788 break;
789
790 case ir_unop_f2b:
791 if (brw->gen >= 6) {
792 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
793 } else {
794 inst = emit(MOV(dst_null_f(), op[0]));
795 inst->conditional_mod = BRW_CONDITIONAL_NZ;
796 }
797 break;
798
799 case ir_unop_i2b:
800 if (brw->gen >= 6) {
801 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
802 } else {
803 inst = emit(MOV(dst_null_d(), op[0]));
804 inst->conditional_mod = BRW_CONDITIONAL_NZ;
805 }
806 break;
807
808 case ir_binop_all_equal:
809 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
810 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
811 break;
812
813 case ir_binop_any_nequal:
814 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
815 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
816 break;
817
818 case ir_unop_any:
819 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
820 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
821 break;
822
823 case ir_binop_greater:
824 case ir_binop_gequal:
825 case ir_binop_less:
826 case ir_binop_lequal:
827 case ir_binop_equal:
828 case ir_binop_nequal:
829 emit(CMP(dst_null_d(), op[0], op[1],
830 brw_conditional_for_comparison(expr->operation)));
831 break;
832
833 default:
834 assert(!"not reached");
835 break;
836 }
837 return;
838 }
839
840 ir->accept(this);
841
842 resolve_ud_negate(&this->result);
843
844 if (brw->gen >= 6) {
845 vec4_instruction *inst = emit(AND(dst_null_d(),
846 this->result, src_reg(1)));
847 inst->conditional_mod = BRW_CONDITIONAL_NZ;
848 } else {
849 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
850 inst->conditional_mod = BRW_CONDITIONAL_NZ;
851 }
852 }
853
854 /**
855 * Emit a gen6 IF statement with the comparison folded into the IF
856 * instruction.
857 */
858 void
859 vec4_visitor::emit_if_gen6(ir_if *ir)
860 {
861 ir_expression *expr = ir->condition->as_expression();
862
863 if (expr) {
864 src_reg op[2];
865 dst_reg temp;
866
867 assert(expr->get_num_operands() <= 2);
868 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
869 expr->operands[i]->accept(this);
870 op[i] = this->result;
871 }
872
873 switch (expr->operation) {
874 case ir_unop_logic_not:
875 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
876 return;
877
878 case ir_binop_logic_xor:
879 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
880 return;
881
882 case ir_binop_logic_or:
883 temp = dst_reg(this, glsl_type::bool_type);
884 emit(OR(temp, op[0], op[1]));
885 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
886 return;
887
888 case ir_binop_logic_and:
889 temp = dst_reg(this, glsl_type::bool_type);
890 emit(AND(temp, op[0], op[1]));
891 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
892 return;
893
894 case ir_unop_f2b:
895 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
896 return;
897
898 case ir_unop_i2b:
899 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
900 return;
901
902 case ir_binop_greater:
903 case ir_binop_gequal:
904 case ir_binop_less:
905 case ir_binop_lequal:
906 case ir_binop_equal:
907 case ir_binop_nequal:
908 emit(IF(op[0], op[1],
909 brw_conditional_for_comparison(expr->operation)));
910 return;
911
912 case ir_binop_all_equal:
913 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
914 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
915 return;
916
917 case ir_binop_any_nequal:
918 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
919 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
920 return;
921
922 case ir_unop_any:
923 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
924 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
925 return;
926
927 default:
928 assert(!"not reached");
929 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
930 return;
931 }
932 return;
933 }
934
935 ir->condition->accept(this);
936
937 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
938 }
939
940 void
941 vec4_visitor::visit(ir_variable *ir)
942 {
943 dst_reg *reg = NULL;
944
945 if (variable_storage(ir))
946 return;
947
948 switch (ir->data.mode) {
949 case ir_var_shader_in:
950 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
951 break;
952
953 case ir_var_shader_out:
954 reg = new(mem_ctx) dst_reg(this, ir->type);
955
956 for (int i = 0; i < type_size(ir->type); i++) {
957 output_reg[ir->data.location + i] = *reg;
958 output_reg[ir->data.location + i].reg_offset = i;
959 output_reg[ir->data.location + i].type =
960 brw_type_for_base_type(ir->type->get_scalar_type());
961 output_reg_annotation[ir->data.location + i] = ir->name;
962 }
963 break;
964
965 case ir_var_auto:
966 case ir_var_temporary:
967 reg = new(mem_ctx) dst_reg(this, ir->type);
968 break;
969
970 case ir_var_uniform:
971 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
972
973 /* Thanks to the lower_ubo_reference pass, we will see only
974 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
975 * variables, so no need for them to be in variable_ht.
976 *
977 * Atomic counters take no uniform storage, no need to do
978 * anything here.
979 */
980 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
981 return;
982
983 /* Track how big the whole uniform variable is, in case we need to put a
984 * copy of its data into pull constants for array access.
985 */
986 assert(this->uniforms < uniform_array_size);
987 this->uniform_size[this->uniforms] = type_size(ir->type);
988
989 if (!strncmp(ir->name, "gl_", 3)) {
990 setup_builtin_uniform_values(ir);
991 } else {
992 setup_uniform_values(ir);
993 }
994 break;
995
996 case ir_var_system_value:
997 reg = make_reg_for_system_value(ir);
998 break;
999
1000 default:
1001 assert(!"not reached");
1002 }
1003
1004 reg->type = brw_type_for_base_type(ir->type);
1005 hash_table_insert(this->variable_ht, reg, ir);
1006 }
1007
1008 void
1009 vec4_visitor::visit(ir_loop *ir)
1010 {
1011 /* We don't want debugging output to print the whole body of the
1012 * loop as the annotation.
1013 */
1014 this->base_ir = NULL;
1015
1016 emit(BRW_OPCODE_DO);
1017
1018 visit_instructions(&ir->body_instructions);
1019
1020 emit(BRW_OPCODE_WHILE);
1021 }
1022
1023 void
1024 vec4_visitor::visit(ir_loop_jump *ir)
1025 {
1026 switch (ir->mode) {
1027 case ir_loop_jump::jump_break:
1028 emit(BRW_OPCODE_BREAK);
1029 break;
1030 case ir_loop_jump::jump_continue:
1031 emit(BRW_OPCODE_CONTINUE);
1032 break;
1033 }
1034 }
1035
1036
1037 void
1038 vec4_visitor::visit(ir_function_signature *ir)
1039 {
1040 assert(0);
1041 (void)ir;
1042 }
1043
1044 void
1045 vec4_visitor::visit(ir_function *ir)
1046 {
1047 /* Ignore function bodies other than main() -- we shouldn't see calls to
1048 * them since they should all be inlined.
1049 */
1050 if (strcmp(ir->name, "main") == 0) {
1051 const ir_function_signature *sig;
1052 exec_list empty;
1053
1054 sig = ir->matching_signature(NULL, &empty);
1055
1056 assert(sig);
1057
1058 visit_instructions(&sig->body);
1059 }
1060 }
1061
1062 bool
1063 vec4_visitor::try_emit_sat(ir_expression *ir)
1064 {
1065 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1066 if (!sat_src)
1067 return false;
1068
1069 sat_src->accept(this);
1070 src_reg src = this->result;
1071
1072 this->result = src_reg(this, ir->type);
1073 vec4_instruction *inst;
1074 inst = emit(MOV(dst_reg(this->result), src));
1075 inst->saturate = true;
1076
1077 return true;
1078 }
1079
1080 bool
1081 vec4_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
1082 {
1083 /* 3-src instructions were introduced in gen6. */
1084 if (brw->gen < 6)
1085 return false;
1086
1087 /* MAD can only handle floating-point data. */
1088 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1089 return false;
1090
1091 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
1092 ir_expression *mul = ir->operands[mul_arg]->as_expression();
1093
1094 if (!mul || mul->operation != ir_binop_mul)
1095 return false;
1096
1097 nonmul->accept(this);
1098 src_reg src0 = fix_3src_operand(this->result);
1099
1100 mul->operands[0]->accept(this);
1101 src_reg src1 = fix_3src_operand(this->result);
1102
1103 mul->operands[1]->accept(this);
1104 src_reg src2 = fix_3src_operand(this->result);
1105
1106 this->result = src_reg(this, ir->type);
1107 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1108
1109 return true;
1110 }
1111
1112 void
1113 vec4_visitor::emit_bool_comparison(unsigned int op,
1114 dst_reg dst, src_reg src0, src_reg src1)
1115 {
1116 /* original gen4 does destination conversion before comparison. */
1117 if (brw->gen < 5)
1118 dst.type = src0.type;
1119
1120 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1121
1122 dst.type = BRW_REGISTER_TYPE_D;
1123 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1124 }
1125
1126 void
1127 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1128 src_reg src0, src_reg src1)
1129 {
1130 vec4_instruction *inst;
1131
1132 if (brw->gen >= 6) {
1133 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1134 inst->conditional_mod = conditionalmod;
1135 } else {
1136 emit(CMP(dst, src0, src1, conditionalmod));
1137
1138 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1139 inst->predicate = BRW_PREDICATE_NORMAL;
1140 }
1141 }
1142
1143 void
1144 vec4_visitor::emit_lrp(const dst_reg &dst,
1145 const src_reg &x, const src_reg &y, const src_reg &a)
1146 {
1147 if (brw->gen >= 6) {
1148 /* Note that the instruction's argument order is reversed from GLSL
1149 * and the IR.
1150 */
1151 emit(LRP(dst,
1152 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1153 } else {
1154 /* Earlier generations don't support three source operations, so we
1155 * need to emit x*(1-a) + y*a.
1156 *
1157 * A better way to do this would be:
1158 * ADD one_minus_a, negate(a), 1.0f
1159 * MUL null, y, a
1160 * MAC dst, x, one_minus_a
1161 * but we would need to support MAC and implicit accumulator.
1162 */
1163 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1164 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1165 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1166 y_times_a.writemask = dst.writemask;
1167 one_minus_a.writemask = dst.writemask;
1168 x_times_one_minus_a.writemask = dst.writemask;
1169
1170 emit(MUL(y_times_a, y, a));
1171 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1172 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1173 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1174 }
1175 }
1176
1177 static bool
1178 is_16bit_constant(ir_rvalue *rvalue)
1179 {
1180 ir_constant *constant = rvalue->as_constant();
1181 if (!constant)
1182 return false;
1183
1184 if (constant->type != glsl_type::int_type &&
1185 constant->type != glsl_type::uint_type)
1186 return false;
1187
1188 return constant->value.u[0] < (1 << 16);
1189 }
1190
1191 void
1192 vec4_visitor::visit(ir_expression *ir)
1193 {
1194 unsigned int operand;
1195 src_reg op[Elements(ir->operands)];
1196 src_reg result_src;
1197 dst_reg result_dst;
1198 vec4_instruction *inst;
1199
1200 if (try_emit_sat(ir))
1201 return;
1202
1203 if (ir->operation == ir_binop_add) {
1204 if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
1205 return;
1206 }
1207
1208 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1209 this->result.file = BAD_FILE;
1210 ir->operands[operand]->accept(this);
1211 if (this->result.file == BAD_FILE) {
1212 fprintf(stderr, "Failed to get tree for expression operand:\n");
1213 ir->operands[operand]->fprint(stderr);
1214 exit(1);
1215 }
1216 op[operand] = this->result;
1217
1218 /* Matrix expression operands should have been broken down to vector
1219 * operations already.
1220 */
1221 assert(!ir->operands[operand]->type->is_matrix());
1222 }
1223
1224 int vector_elements = ir->operands[0]->type->vector_elements;
1225 if (ir->operands[1]) {
1226 vector_elements = MAX2(vector_elements,
1227 ir->operands[1]->type->vector_elements);
1228 }
1229
1230 this->result.file = BAD_FILE;
1231
1232 /* Storage for our result. Ideally for an assignment we'd be using
1233 * the actual storage for the result here, instead.
1234 */
1235 result_src = src_reg(this, ir->type);
1236 /* convenience for the emit functions below. */
1237 result_dst = dst_reg(result_src);
1238 /* If nothing special happens, this is the result. */
1239 this->result = result_src;
1240 /* Limit writes to the channels that will be used by result_src later.
1241 * This does limit this temp's use as a temporary for multi-instruction
1242 * sequences.
1243 */
1244 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1245
1246 switch (ir->operation) {
1247 case ir_unop_logic_not:
1248 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1249 * ones complement of the whole register, not just bit 0.
1250 */
1251 emit(XOR(result_dst, op[0], src_reg(1)));
1252 break;
1253 case ir_unop_neg:
1254 op[0].negate = !op[0].negate;
1255 emit(MOV(result_dst, op[0]));
1256 break;
1257 case ir_unop_abs:
1258 op[0].abs = true;
1259 op[0].negate = false;
1260 emit(MOV(result_dst, op[0]));
1261 break;
1262
1263 case ir_unop_sign:
1264 if (ir->type->is_float()) {
1265 /* AND(val, 0x80000000) gives the sign bit.
1266 *
1267 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1268 * zero.
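 *
 * Illustrative example: for a channel holding -3.5f, the CMP below sets
 * the flag (the value is nonzero), the AND leaves 0x80000000, and the
 * predicated OR produces 0x80000000 | 0x3f800000 = 0xbf800000 = -1.0f.
 * A zero input skips the OR and stays 0x00000000 = 0.0f.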
1269 */
1270 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1271
1272 op[0].type = BRW_REGISTER_TYPE_UD;
1273 result_dst.type = BRW_REGISTER_TYPE_UD;
1274 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1275
1276 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1277 inst->predicate = BRW_PREDICATE_NORMAL;
1278
1279 this->result.type = BRW_REGISTER_TYPE_F;
1280 } else {
1281 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1282 * -> non-negative val generates 0x00000000.
1283 * Predicated OR sets 1 if val is positive.
1284 */
1285 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1286
1287 emit(ASR(result_dst, op[0], src_reg(31)));
1288
1289 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1290 inst->predicate = BRW_PREDICATE_NORMAL;
1291 }
1292 break;
1293
1294 case ir_unop_rcp:
1295 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1296 break;
1297
1298 case ir_unop_exp2:
1299 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1300 break;
1301 case ir_unop_log2:
1302 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1303 break;
1304 case ir_unop_exp:
1305 case ir_unop_log:
1306 assert(!"not reached: should be handled by ir_explog_to_explog2");
1307 break;
1308 case ir_unop_sin:
1309 case ir_unop_sin_reduced:
1310 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1311 break;
1312 case ir_unop_cos:
1313 case ir_unop_cos_reduced:
1314 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1315 break;
1316
1317 case ir_unop_dFdx:
1318 case ir_unop_dFdy:
1319 assert(!"derivatives not valid in vertex shader");
1320 break;
1321
1322 case ir_unop_bitfield_reverse:
1323 emit(BFREV(result_dst, op[0]));
1324 break;
1325 case ir_unop_bit_count:
1326 emit(CBIT(result_dst, op[0]));
1327 break;
1328 case ir_unop_find_msb: {
1329 src_reg temp = src_reg(this, glsl_type::uint_type);
1330
1331 inst = emit(FBH(dst_reg(temp), op[0]));
1332 inst->dst.writemask = WRITEMASK_XYZW;
1333
1334 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1335 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1336 * subtract the result from 31 to convert the MSB count into an LSB count.
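 *
 * Illustrative example: for an input of 0x00000100, FBH reports the set
 * bit as 23 positions down from bit 31, and 31 - 23 = 8 matches
 * findMSB().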
1337 */
1338
1339 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1340 temp.swizzle = BRW_SWIZZLE_NOOP;
1341 emit(MOV(result_dst, temp));
1342
1343 src_reg src_tmp = src_reg(result_dst);
1344 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1345
1346 src_tmp.negate = true;
1347 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1348 inst->predicate = BRW_PREDICATE_NORMAL;
1349 break;
1350 }
1351 case ir_unop_find_lsb:
1352 emit(FBL(result_dst, op[0]));
1353 break;
1354
1355 case ir_unop_noise:
1356 assert(!"not reached: should be handled by lower_noise");
1357 break;
1358
1359 case ir_binop_add:
1360 emit(ADD(result_dst, op[0], op[1]));
1361 break;
1362 case ir_binop_sub:
1363 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1364 break;
1365
1366 case ir_binop_mul:
1367 if (brw->gen < 8 && ir->type->is_integer()) {
1368 /* For integer multiplication, the MUL uses the low 16 bits of one of
1369 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1370 * accumulates the contribution of the upper 16 bits of that
1371 * operand. If we can determine that one of the args is in the low
1372 * 16 bits, though, we can just emit a single MUL.
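 *
 * Illustrative example: "x * 42" qualifies (42 < 1 << 16), so a single
 * MUL with 42 routed to the 16-bit source slot is enough; a general
 * "a * b" needs the MUL/MACH/MOV sequence below.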
1373 */
1374 if (is_16bit_constant(ir->operands[0])) {
1375 if (brw->gen < 7)
1376 emit(MUL(result_dst, op[0], op[1]));
1377 else
1378 emit(MUL(result_dst, op[1], op[0]));
1379 } else if (is_16bit_constant(ir->operands[1])) {
1380 if (brw->gen < 7)
1381 emit(MUL(result_dst, op[1], op[0]));
1382 else
1383 emit(MUL(result_dst, op[0], op[1]));
1384 } else {
1385 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1386
1387 emit(MUL(acc, op[0], op[1]));
1388 emit(MACH(dst_null_d(), op[0], op[1]));
1389 emit(MOV(result_dst, src_reg(acc)));
1390 }
1391 } else {
1392 emit(MUL(result_dst, op[0], op[1]));
1393 }
1394 break;
1395 case ir_binop_imul_high: {
1396 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1397
1398 emit(MUL(acc, op[0], op[1]));
1399 emit(MACH(result_dst, op[0], op[1]));
1400 break;
1401 }
1402 case ir_binop_div:
1403 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1404 assert(ir->type->is_integer());
1405 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1406 break;
1407 case ir_binop_carry: {
1408 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1409
1410 emit(ADDC(dst_null_ud(), op[0], op[1]));
1411 emit(MOV(result_dst, src_reg(acc)));
1412 break;
1413 }
1414 case ir_binop_borrow: {
1415 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1416
1417 emit(SUBB(dst_null_ud(), op[0], op[1]));
1418 emit(MOV(result_dst, src_reg(acc)));
1419 break;
1420 }
1421 case ir_binop_mod:
1422 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1423 assert(ir->type->is_integer());
1424 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1425 break;
1426
1427 case ir_binop_less:
1428 case ir_binop_greater:
1429 case ir_binop_lequal:
1430 case ir_binop_gequal:
1431 case ir_binop_equal:
1432 case ir_binop_nequal: {
1433 emit(CMP(result_dst, op[0], op[1],
1434 brw_conditional_for_comparison(ir->operation)));
1435 emit(AND(result_dst, result_src, src_reg(0x1)));
1436 break;
1437 }
1438
1439 case ir_binop_all_equal:
1440 /* "==" operator producing a scalar boolean. */
1441 if (ir->operands[0]->type->is_vector() ||
1442 ir->operands[1]->type->is_vector()) {
1443 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1444 emit(MOV(result_dst, src_reg(0)));
1445 inst = emit(MOV(result_dst, src_reg(1)));
1446 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1447 } else {
1448 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1449 emit(AND(result_dst, result_src, src_reg(0x1)));
1450 }
1451 break;
1452 case ir_binop_any_nequal:
1453 /* "!=" operator producing a scalar boolean. */
1454 if (ir->operands[0]->type->is_vector() ||
1455 ir->operands[1]->type->is_vector()) {
1456 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1457
1458 emit(MOV(result_dst, src_reg(0)));
1459 inst = emit(MOV(result_dst, src_reg(1)));
1460 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1461 } else {
1462 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1463 emit(AND(result_dst, result_src, src_reg(0x1)));
1464 }
1465 break;
1466
1467 case ir_unop_any:
1468 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1469 emit(MOV(result_dst, src_reg(0)));
1470
1471 inst = emit(MOV(result_dst, src_reg(1)));
1472 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1473 break;
1474
1475 case ir_binop_logic_xor:
1476 emit(XOR(result_dst, op[0], op[1]));
1477 break;
1478
1479 case ir_binop_logic_or:
1480 emit(OR(result_dst, op[0], op[1]));
1481 break;
1482
1483 case ir_binop_logic_and:
1484 emit(AND(result_dst, op[0], op[1]));
1485 break;
1486
1487 case ir_binop_dot:
1488 assert(ir->operands[0]->type->is_vector());
1489 assert(ir->operands[0]->type == ir->operands[1]->type);
1490 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1491 break;
1492
1493 case ir_unop_sqrt:
1494 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1495 break;
1496 case ir_unop_rsq:
1497 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1498 break;
1499
1500 case ir_unop_bitcast_i2f:
1501 case ir_unop_bitcast_u2f:
1502 this->result = op[0];
1503 this->result.type = BRW_REGISTER_TYPE_F;
1504 break;
1505
1506 case ir_unop_bitcast_f2i:
1507 this->result = op[0];
1508 this->result.type = BRW_REGISTER_TYPE_D;
1509 break;
1510
1511 case ir_unop_bitcast_f2u:
1512 this->result = op[0];
1513 this->result.type = BRW_REGISTER_TYPE_UD;
1514 break;
1515
1516 case ir_unop_i2f:
1517 case ir_unop_i2u:
1518 case ir_unop_u2i:
1519 case ir_unop_u2f:
1520 case ir_unop_b2f:
1521 case ir_unop_b2i:
1522 case ir_unop_f2i:
1523 case ir_unop_f2u:
1524 emit(MOV(result_dst, op[0]));
1525 break;
1526 case ir_unop_f2b:
1527 case ir_unop_i2b: {
1528 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1529 emit(AND(result_dst, result_src, src_reg(1)));
1530 break;
1531 }
1532
1533 case ir_unop_trunc:
1534 emit(RNDZ(result_dst, op[0]));
1535 break;
1536 case ir_unop_ceil:
1537 op[0].negate = !op[0].negate;
1538 inst = emit(RNDD(result_dst, op[0]));
1539 this->result.negate = true;
1540 break;
1541 case ir_unop_floor:
1542 inst = emit(RNDD(result_dst, op[0]));
1543 break;
1544 case ir_unop_fract:
1545 inst = emit(FRC(result_dst, op[0]));
1546 break;
1547 case ir_unop_round_even:
1548 emit(RNDE(result_dst, op[0]));
1549 break;
1550
1551 case ir_binop_min:
1552 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1553 break;
1554 case ir_binop_max:
1555 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1556 break;
1557
1558 case ir_binop_pow:
1559 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1560 break;
1561
1562 case ir_unop_bit_not:
1563 inst = emit(NOT(result_dst, op[0]));
1564 break;
1565 case ir_binop_bit_and:
1566 inst = emit(AND(result_dst, op[0], op[1]));
1567 break;
1568 case ir_binop_bit_xor:
1569 inst = emit(XOR(result_dst, op[0], op[1]));
1570 break;
1571 case ir_binop_bit_or:
1572 inst = emit(OR(result_dst, op[0], op[1]));
1573 break;
1574
1575 case ir_binop_lshift:
1576 inst = emit(SHL(result_dst, op[0], op[1]));
1577 break;
1578
1579 case ir_binop_rshift:
1580 if (ir->type->base_type == GLSL_TYPE_INT)
1581 inst = emit(ASR(result_dst, op[0], op[1]));
1582 else
1583 inst = emit(SHR(result_dst, op[0], op[1]));
1584 break;
1585
1586 case ir_binop_bfm:
1587 emit(BFI1(result_dst, op[0], op[1]));
1588 break;
1589
1590 case ir_binop_ubo_load: {
1591 ir_constant *uniform_block = ir->operands[0]->as_constant();
1592 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1593 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1594 src_reg offset;
1595
1596 /* Now, load the vector from that offset. */
1597 assert(ir->type->is_vector() || ir->type->is_scalar());
1598
1599 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1600 packed_consts.type = result.type;
1601 src_reg surf_index =
1602 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1603 if (const_offset_ir) {
1604 if (brw->gen >= 8) {
1605 /* Store the offset in a GRF so we can send-from-GRF. */
1606 offset = src_reg(this, glsl_type::int_type);
1607 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1608 } else {
1609 /* Immediates are fine on older generations since they'll be moved
1610 * to a (potentially fake) MRF at the generator level.
1611 */
1612 offset = src_reg(const_offset / 16);
1613 }
1614 } else {
1615 offset = src_reg(this, glsl_type::uint_type);
1616 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1617 }
1618
1619 if (brw->gen >= 7) {
1620 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1621 grf_offset.type = offset.type;
1622
1623 emit(MOV(grf_offset, offset));
1624
1625 emit(new(mem_ctx) vec4_instruction(this,
1626 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1627 dst_reg(packed_consts),
1628 surf_index,
1629 src_reg(grf_offset)));
1630 } else {
1631 vec4_instruction *pull =
1632 emit(new(mem_ctx) vec4_instruction(this,
1633 VS_OPCODE_PULL_CONSTANT_LOAD,
1634 dst_reg(packed_consts),
1635 surf_index,
1636 offset));
1637 pull->base_mrf = 14;
1638 pull->mlen = 1;
1639 }
1640
1641 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1642 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1643 const_offset % 16 / 4,
1644 const_offset % 16 / 4,
1645 const_offset % 16 / 4);
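/* Illustrative example: a vec2 at const_offset 24 is read from the second
 * 16-byte slot of the block (24 / 16 == 1, folded into the offset above)
 * and starts at component 2 (24 % 16 / 4), so the swizzle becomes ZWWW.
 */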
1646
1647 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1648 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1649 emit(CMP(result_dst, packed_consts, src_reg(0u),
1650 BRW_CONDITIONAL_NZ));
1651 emit(AND(result_dst, result, src_reg(0x1)));
1652 } else {
1653 emit(MOV(result_dst, packed_consts));
1654 }
1655 break;
1656 }
1657
1658 case ir_binop_vector_extract:
1659 assert(!"should have been lowered by vec_index_to_cond_assign");
1660 break;
1661
1662 case ir_triop_fma:
1663 op[0] = fix_3src_operand(op[0]);
1664 op[1] = fix_3src_operand(op[1]);
1665 op[2] = fix_3src_operand(op[2]);
1666 /* Note that the instruction's argument order is reversed from GLSL
1667 * and the IR.
1668 */
1669 emit(MAD(result_dst, op[2], op[1], op[0]));
1670 break;
1671
1672 case ir_triop_lrp:
1673 emit_lrp(result_dst, op[0], op[1], op[2]);
1674 break;
1675
1676 case ir_triop_csel:
1677 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1678 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1679 inst->predicate = BRW_PREDICATE_NORMAL;
1680 break;
1681
1682 case ir_triop_bfi:
1683 op[0] = fix_3src_operand(op[0]);
1684 op[1] = fix_3src_operand(op[1]);
1685 op[2] = fix_3src_operand(op[2]);
1686 emit(BFI2(result_dst, op[0], op[1], op[2]));
1687 break;
1688
1689 case ir_triop_bitfield_extract:
1690 op[0] = fix_3src_operand(op[0]);
1691 op[1] = fix_3src_operand(op[1]);
1692 op[2] = fix_3src_operand(op[2]);
1693 /* Note that the instruction's argument order is reversed from GLSL
1694 * and the IR.
1695 */
1696 emit(BFE(result_dst, op[2], op[1], op[0]));
1697 break;
1698
1699 case ir_triop_vector_insert:
1700 assert(!"should have been lowered by lower_vector_insert");
1701 break;
1702
1703 case ir_quadop_bitfield_insert:
1704 assert(!"not reached: should be handled by "
1705 "bitfield_insert_to_bfm_bfi\n");
1706 break;
1707
1708 case ir_quadop_vector:
1709 assert(!"not reached: should be handled by lower_quadop_vector");
1710 break;
1711
1712 case ir_unop_pack_half_2x16:
1713 emit_pack_half_2x16(result_dst, op[0]);
1714 break;
1715 case ir_unop_unpack_half_2x16:
1716 emit_unpack_half_2x16(result_dst, op[0]);
1717 break;
1718 case ir_unop_pack_snorm_2x16:
1719 case ir_unop_pack_snorm_4x8:
1720 case ir_unop_pack_unorm_2x16:
1721 case ir_unop_pack_unorm_4x8:
1722 case ir_unop_unpack_snorm_2x16:
1723 case ir_unop_unpack_snorm_4x8:
1724 case ir_unop_unpack_unorm_2x16:
1725 case ir_unop_unpack_unorm_4x8:
1726 assert(!"not reached: should be handled by lower_packing_builtins");
1727 break;
1728 case ir_unop_unpack_half_2x16_split_x:
1729 case ir_unop_unpack_half_2x16_split_y:
1730 case ir_binop_pack_half_2x16_split:
1731 assert(!"not reached: should not occur in vertex shader");
1732 break;
1733 case ir_binop_ldexp:
1734 assert(!"not reached: should be handled by ldexp_to_arith()");
1735 break;
1736 }
1737 }
1738
1739
1740 void
1741 vec4_visitor::visit(ir_swizzle *ir)
1742 {
1743 src_reg src;
1744 int i = 0;
1745 int swizzle[4];
1746
1747 /* Note that this is only swizzles in expressions, not those on the left
1748 * hand side of an assignment, which do write masking. See ir_assignment
1749 * for that.
1750 */
1751
1752 ir->val->accept(this);
1753 src = this->result;
1754 assert(src.file != BAD_FILE);
1755
1756 for (i = 0; i < ir->type->vector_elements; i++) {
1757 switch (i) {
1758 case 0:
1759 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1760 break;
1761 case 1:
1762 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1763 break;
1764 case 2:
1765 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1766 break;
1767 case 3:
1768 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1769 break;
1770 }
1771 }
1772 for (; i < 4; i++) {
1773 /* Replicate the last channel out. */
1774 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1775 }
1776
1777 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1778
1779 this->result = src;
1780 }
1781
1782 void
1783 vec4_visitor::visit(ir_dereference_variable *ir)
1784 {
1785 const struct glsl_type *type = ir->type;
1786 dst_reg *reg = variable_storage(ir->var);
1787
1788 if (!reg) {
1789 fail("Failed to find variable storage for %s\n", ir->var->name);
1790 this->result = src_reg(brw_null_reg());
1791 return;
1792 }
1793
1794 this->result = src_reg(*reg);
1795
1796 /* System values get their swizzle from the dst_reg writemask */
1797 if (ir->var->data.mode == ir_var_system_value)
1798 return;
1799
1800 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1801 this->result.swizzle = swizzle_for_size(type->vector_elements);
1802 }
1803
1804
1805 int
1806 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1807 {
1808 /* Under normal circumstances array elements are stored consecutively, so
1809 * the stride is equal to the size of the array element.
1810 */
1811 return type_size(ir->type);
1812 }
1813
1814
1815 void
1816 vec4_visitor::visit(ir_dereference_array *ir)
1817 {
1818 ir_constant *constant_index;
1819 src_reg src;
1820 int array_stride = compute_array_stride(ir);
1821
1822 constant_index = ir->array_index->constant_expression_value();
1823
1824 ir->array->accept(this);
1825 src = this->result;
1826
1827 if (constant_index) {
1828 src.reg_offset += constant_index->value.i[0] * array_stride;
1829 } else {
1830 /* Variable index array dereference. It eats the "vec4" of the
1831 * base of the array and an index that offsets the Mesa register
1832 * index.
1833 */
1834 ir->array_index->accept(this);
1835
1836 src_reg index_reg;
1837
1838 if (array_stride == 1) {
1839 index_reg = this->result;
1840 } else {
1841 index_reg = src_reg(this, glsl_type::int_type);
1842
1843 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1844 }
1845
1846 if (src.reladdr) {
1847 src_reg temp = src_reg(this, glsl_type::int_type);
1848
1849 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1850
1851 index_reg = temp;
1852 }
1853
1854 src.reladdr = ralloc(mem_ctx, src_reg);
1855 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1856 }
1857
1858 /* If the type is smaller than a vec4, replicate the last channel out. */
1859 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1860 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1861 else
1862 src.swizzle = BRW_SWIZZLE_NOOP;
1863 src.type = brw_type_for_base_type(ir->type);
1864
1865 this->result = src;
1866 }
1867
1868 void
1869 vec4_visitor::visit(ir_dereference_record *ir)
1870 {
1871 unsigned int i;
1872 const glsl_type *struct_type = ir->record->type;
1873 int offset = 0;
1874
1875 ir->record->accept(this);
1876
1877 for (i = 0; i < struct_type->length; i++) {
1878 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1879 break;
1880 offset += type_size(struct_type->fields.structure[i].type);
1881 }
1882
1883 /* If the type is smaller than a vec4, replicate the last channel out. */
1884 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1885 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1886 else
1887 this->result.swizzle = BRW_SWIZZLE_NOOP;
1888 this->result.type = brw_type_for_base_type(ir->type);
1889
1890 this->result.reg_offset += offset;
1891 }
1892
1893 /**
1894 * We want to be careful in assignment setup to hit the actual storage
1895 * instead of potentially using a temporary like we might with the
1896 * ir_dereference handler.
1897 */
1898 static dst_reg
1899 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1900 {
1901 /* The LHS must be a dereference. If the LHS is a variable indexed array
1902 * access of a vector, it must be separated into a series of conditional moves
1903 * before reaching this point (see ir_vec_index_to_cond_assign).
1904 */
1905 assert(ir->as_dereference());
1906 ir_dereference_array *deref_array = ir->as_dereference_array();
1907 if (deref_array) {
1908 assert(!deref_array->array->type->is_vector());
1909 }
1910
1911 /* Use the rvalue deref handler for the most part. We'll ignore
1912 * swizzles in it and write swizzles using writemask, though.
1913 */
1914 ir->accept(v);
1915 return dst_reg(v->result);
1916 }
1917
1918 void
1919 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1920 const struct glsl_type *type, uint32_t predicate)
1921 {
1922 if (type->base_type == GLSL_TYPE_STRUCT) {
1923 for (unsigned int i = 0; i < type->length; i++) {
1924 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1925 }
1926 return;
1927 }
1928
1929 if (type->is_array()) {
1930 for (unsigned int i = 0; i < type->length; i++) {
1931 emit_block_move(dst, src, type->fields.array, predicate);
1932 }
1933 return;
1934 }
1935
1936 if (type->is_matrix()) {
1937 const struct glsl_type *vec_type;
1938
1939 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1940 type->vector_elements, 1);
1941
1942 for (int i = 0; i < type->matrix_columns; i++) {
1943 emit_block_move(dst, src, vec_type, predicate);
1944 }
1945 return;
1946 }
1947
1948 assert(type->is_scalar() || type->is_vector());
1949
1950 dst->type = brw_type_for_base_type(type);
1951 src->type = dst->type;
1952
1953 dst->writemask = (1 << type->vector_elements) - 1;
1954
1955 src->swizzle = swizzle_for_size(type->vector_elements);
1956
1957 vec4_instruction *inst = emit(MOV(*dst, *src));
1958 inst->predicate = predicate;
1959
1960 dst->reg_offset++;
1961 src->reg_offset++;
1962 }
1963
1964
1965 /* If the RHS processing resulted in an instruction generating a
1966 * temporary value, and it would be easy to rewrite the instruction to
1967 * generate its result right into the LHS instead, do so. This ends
1968 * up reliably removing instructions where it can be tricky to do so
1969 * later without real UD chain information.
1970 */
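/* For example, for "a = b + c;" the ADD generated while visiting the RHS
 * first targets a temporary GRF; pointing that ADD's destination at "a"
 * instead makes the trailing copy MOV unnecessary.
 */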
1971 bool
1972 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1973 dst_reg dst,
1974 src_reg src,
1975 vec4_instruction *pre_rhs_inst,
1976 vec4_instruction *last_rhs_inst)
1977 {
1978 /* This could be supported, but it would take more smarts. */
1979 if (ir->condition)
1980 return false;
1981
1982 if (pre_rhs_inst == last_rhs_inst)
1983 return false; /* No instructions generated to work with. */
1984
1985 /* Make sure the last instruction generated our source reg. */
1986 if (src.file != GRF ||
1987 src.file != last_rhs_inst->dst.file ||
1988 src.reg != last_rhs_inst->dst.reg ||
1989 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1990 src.reladdr ||
1991 src.abs ||
1992 src.negate ||
1993 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1994 return false;
1995
1996 /* Check that the last instruction fully initialized the channels
1997 * we want to use, in the order we want to use them. We could
1998 * potentially reswizzle the operands of many instructions so that
1999 * we could handle out of order channels, but don't yet.
2000 */
2001
2002 for (unsigned i = 0; i < 4; i++) {
2003 if (dst.writemask & (1 << i)) {
2004 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2005 return false;
2006
2007 if (BRW_GET_SWZ(src.swizzle, i) != i)
2008 return false;
2009 }
2010 }
2011
2012 /* Success! Rewrite the instruction. */
2013 last_rhs_inst->dst.file = dst.file;
2014 last_rhs_inst->dst.reg = dst.reg;
2015 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2016 last_rhs_inst->dst.reladdr = dst.reladdr;
2017 last_rhs_inst->dst.writemask &= dst.writemask;
2018
2019 return true;
2020 }
2021
2022 void
2023 vec4_visitor::visit(ir_assignment *ir)
2024 {
2025 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2026 uint32_t predicate = BRW_PREDICATE_NONE;
2027
2028 if (!ir->lhs->type->is_scalar() &&
2029 !ir->lhs->type->is_vector()) {
2030 ir->rhs->accept(this);
2031 src_reg src = this->result;
2032
2033 if (ir->condition) {
2034 emit_bool_to_cond_code(ir->condition, &predicate);
2035 }
2036
2037 /* emit_block_move doesn't account for swizzles in the source register.
2038 * This should be ok, since the source register is a structure or an
2039 * array, and those can't be swizzled. But double-check to be sure.
2040 */
2041 assert(src.swizzle ==
2042 (ir->rhs->type->is_matrix()
2043 ? swizzle_for_size(ir->rhs->type->vector_elements)
2044 : BRW_SWIZZLE_NOOP));
2045
2046 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2047 return;
2048 }
2049
2050 /* Now we're down to just a scalar/vector with writemasks. */
2051 int i;
2052
2053 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2054 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2055
2056 ir->rhs->accept(this);
2057
2058 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2059
2060 src_reg src = this->result;
2061
2062 int swizzles[4];
2063 int first_enabled_chan = 0;
2064 int src_chan = 0;
2065
2066 assert(ir->lhs->type->is_vector() ||
2067 ir->lhs->type->is_scalar());
2068 dst.writemask = ir->write_mask;
2069
2070 for (int i = 0; i < 4; i++) {
2071 if (dst.writemask & (1 << i)) {
2072 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2073 break;
2074 }
2075 }
2076
2077 /* Swizzle a small RHS vector into the channels being written.
2078 *
2079 * glsl ir treats write_mask as dictating how many channels are
2080 * present on the RHS while in our instructions we need to make
2081 * those channels appear in the slots of the vec4 they're written to.
2082 */
2083 for (int i = 0; i < 4; i++) {
2084 if (dst.writemask & (1 << i))
2085 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2086 else
2087 swizzles[i] = first_enabled_chan;
2088 }
2089 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2090 swizzles[2], swizzles[3]);
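/* Worked example: for "v.yz = u" with a vec2 RHS, the RHS arrives with
 * swizzle .xyyy and dst.writemask is YZ, so the loop above produces the
 * swizzle .yxyy -- u.x feeds channel y, u.y feeds channel z, and the
 * unwritten channels read a don't-care component.
 */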
2091
2092 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2093 return;
2094 }
2095
2096 if (ir->condition) {
2097 emit_bool_to_cond_code(ir->condition, &predicate);
2098 }
2099
2100 for (i = 0; i < type_size(ir->lhs->type); i++) {
2101 vec4_instruction *inst = emit(MOV(dst, src));
2102 inst->predicate = predicate;
2103
2104 dst.reg_offset++;
2105 src.reg_offset++;
2106 }
2107 }
2108
2109 void
2110 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2111 {
2112 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2113 foreach_list(node, &ir->components) {
2114 ir_constant *field_value = (ir_constant *)node;
2115
2116 emit_constant_values(dst, field_value);
2117 }
2118 return;
2119 }
2120
2121 if (ir->type->is_array()) {
2122 for (unsigned int i = 0; i < ir->type->length; i++) {
2123 emit_constant_values(dst, ir->array_elements[i]);
2124 }
2125 return;
2126 }
2127
2128 if (ir->type->is_matrix()) {
2129 for (int i = 0; i < ir->type->matrix_columns; i++) {
2130 float *vec = &ir->value.f[i * ir->type->vector_elements];
2131
2132 for (int j = 0; j < ir->type->vector_elements; j++) {
2133 dst->writemask = 1 << j;
2134 dst->type = BRW_REGISTER_TYPE_F;
2135
2136 emit(MOV(*dst, src_reg(vec[j])));
2137 }
2138 dst->reg_offset++;
2139 }
2140 return;
2141 }
2142
2143 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2144
2145 for (int i = 0; i < ir->type->vector_elements; i++) {
2146 if (!(remaining_writemask & (1 << i)))
2147 continue;
2148
2149 dst->writemask = 1 << i;
2150 dst->type = brw_type_for_base_type(ir->type);
2151
2152 /* Find other components that match the one we're about to
2153 * write. Emits fewer instructions for things like vec4(0.5,
2154 * 1.5, 1.5, 1.5).
2155 */
2156 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2157 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2158 if (ir->value.b[i] == ir->value.b[j])
2159 dst->writemask |= (1 << j);
2160 } else {
2161 /* u, i, and f storage all line up, so no need for a
2162 * switch case for comparing each type.
2163 */
2164 if (ir->value.u[i] == ir->value.u[j])
2165 dst->writemask |= (1 << j);
2166 }
2167 }
2168
2169 switch (ir->type->base_type) {
2170 case GLSL_TYPE_FLOAT:
2171 emit(MOV(*dst, src_reg(ir->value.f[i])));
2172 break;
2173 case GLSL_TYPE_INT:
2174 emit(MOV(*dst, src_reg(ir->value.i[i])));
2175 break;
2176 case GLSL_TYPE_UINT:
2177 emit(MOV(*dst, src_reg(ir->value.u[i])));
2178 break;
2179 case GLSL_TYPE_BOOL:
2180 emit(MOV(*dst, src_reg(ir->value.b[i])));
2181 break;
2182 default:
2183 assert(!"Non-float/uint/int/bool constant");
2184 break;
2185 }
2186
2187 remaining_writemask &= ~dst->writemask;
2188 }
2189 dst->reg_offset++;
2190 }
2191
2192 void
2193 vec4_visitor::visit(ir_constant *ir)
2194 {
2195 dst_reg dst = dst_reg(this, ir->type);
2196 this->result = src_reg(dst);
2197
2198 emit_constant_values(&dst, ir);
2199 }
2200
2201 void
2202 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2203 {
2204 ir_dereference *deref = static_cast<ir_dereference *>(
2205 ir->actual_parameters.get_head());
2206 ir_variable *location = deref->variable_referenced();
2207 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2208 location->data.atomic.buffer_index);
2209
2210 /* Calculate the surface offset */
2211 src_reg offset(this, glsl_type::uint_type);
2212 ir_dereference_array *deref_array = deref->as_dereference_array();
2213 if (deref_array) {
2214 deref_array->array_index->accept(this);
2215
2216 src_reg tmp(this, glsl_type::uint_type);
2217 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2218 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2219 } else {
2220 offset = location->data.atomic.offset;
2221 }
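/* e.g. "counters[i]" yields offset = i * ATOMIC_COUNTER_SIZE plus the
 * counter's own offset, while a non-array counter uses its offset directly.
 */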
2222
2223 /* Emit the appropriate machine instruction */
2224 const char *callee = ir->callee->function_name();
2225 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2226
2227 if (!strcmp("__intrinsic_atomic_read", callee)) {
2228 emit_untyped_surface_read(surf_index, dst, offset);
2229
2230 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2231 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2232 src_reg(), src_reg());
2233
2234 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2235 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2236 src_reg(), src_reg());
2237 }
2238 }
2239
2240 void
2241 vec4_visitor::visit(ir_call *ir)
2242 {
2243 const char *callee = ir->callee->function_name();
2244
2245 if (!strcmp("__intrinsic_atomic_read", callee) ||
2246 !strcmp("__intrinsic_atomic_increment", callee) ||
2247 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2248 visit_atomic_counter_intrinsic(ir);
2249 } else {
2250 assert(!"Unsupported intrinsic.");
2251 }
2252 }
2253
2254 src_reg
2255 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2256 {
2257 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2258 inst->base_mrf = 2;
2259 inst->mlen = 1;
2260 inst->sampler = sampler;
2261 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2262 inst->dst.writemask = WRITEMASK_XYZW;
2263
2264 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2265 int param_base = inst->base_mrf;
2266 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2267 int zero_mask = 0xf & ~coord_mask;
2268
2269 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2270 coordinate));
2271
2272 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2273 src_reg(0)));
2274
2275 emit(inst);
2276 return src_reg(inst->dst);
2277 }
2278
2279 void
2280 vec4_visitor::visit(ir_texture *ir)
2281 {
2282 int sampler =
2283 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2284
2285 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2286 * emitting anything other than setting up the constant result.
2287 */
2288 if (ir->op == ir_tg4) {
2289 ir_constant *chan = ir->lod_info.component->as_constant();
2290 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2291 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2292 dst_reg result(this, ir->type);
2293 this->result = src_reg(result);
2294 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2295 return;
2296 }
2297 }
2298
2299 /* Should be lowered by do_lower_texture_projection */
2300 assert(!ir->projector);
2301
2302 /* Should be lowered */
2303 assert(!ir->offset || !ir->offset->type->is_array());
2304
2305 /* Generate code to compute all the subexpression trees. This has to be
2306 * done before loading any values into MRFs for the sampler message since
2307 * generating these values may involve SEND messages that need the MRFs.
2308 */
2309 src_reg coordinate;
2310 if (ir->coordinate) {
2311 ir->coordinate->accept(this);
2312 coordinate = this->result;
2313 }
2314
2315 src_reg shadow_comparitor;
2316 if (ir->shadow_comparitor) {
2317 ir->shadow_comparitor->accept(this);
2318 shadow_comparitor = this->result;
2319 }
2320
2321 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2322 src_reg offset_value;
2323 if (has_nonconstant_offset) {
2324 ir->offset->accept(this);
2325 offset_value = src_reg(this->result);
2326 }
2327
2328 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2329 src_reg lod, dPdx, dPdy, sample_index, mcs;
2330 switch (ir->op) {
2331 case ir_tex:
2332 lod = src_reg(0.0f);
2333 lod_type = glsl_type::float_type;
2334 break;
2335 case ir_txf:
2336 case ir_txl:
2337 case ir_txs:
2338 ir->lod_info.lod->accept(this);
2339 lod = this->result;
2340 lod_type = ir->lod_info.lod->type;
2341 break;
2342 case ir_query_levels:
2343 lod = src_reg(0);
2344 lod_type = glsl_type::int_type;
2345 break;
2346 case ir_txf_ms:
2347 ir->lod_info.sample_index->accept(this);
2348 sample_index = this->result;
2349 sample_index_type = ir->lod_info.sample_index->type;
2350
2351 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2352 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2353 else
2354 mcs = src_reg(0u);
2355 break;
2356 case ir_txd:
2357 ir->lod_info.grad.dPdx->accept(this);
2358 dPdx = this->result;
2359
2360 ir->lod_info.grad.dPdy->accept(this);
2361 dPdy = this->result;
2362
2363 lod_type = ir->lod_info.grad.dPdx->type;
2364 break;
2365 case ir_txb:
2366 case ir_lod:
2367 case ir_tg4:
2368 break;
2369 }
2370
2371 vec4_instruction *inst = NULL;
2372 switch (ir->op) {
2373 case ir_tex:
2374 case ir_txl:
2375 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2376 break;
2377 case ir_txd:
2378 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2379 break;
2380 case ir_txf:
2381 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2382 break;
2383 case ir_txf_ms:
2384 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2385 break;
2386 case ir_txs:
2387 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2388 break;
2389 case ir_tg4:
2390 if (has_nonconstant_offset)
2391 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2392 else
2393 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2394 break;
2395 case ir_query_levels:
2396 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2397 break;
2398 case ir_txb:
2399 assert(!"TXB is not valid for vertex shaders.");
2400 break;
2401 case ir_lod:
2402 assert(!"LOD is not valid for vertex shaders.");
2403 break;
2404 default:
2405 assert(!"Unrecognized tex op");
2406 }
2407
2408 if (ir->offset != NULL && ir->op != ir_txf)
2409 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2410
2411 /* Stuff the channel select bits in the top of the texture offset */
2412 if (ir->op == ir_tg4)
2413 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2414
2415 /* The message header is necessary for:
2416 * - Gen4 (always)
2417 * - Texel offsets
2418 * - Gather channel selection
2419 * - Sampler indices too large to fit in a 4-bit value.
2420 */
2421 inst->header_present =
2422 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2423 sampler >= 16;
2424 inst->base_mrf = 2;
2425 inst->mlen = inst->header_present + 1; /* always at least one */
2426 inst->sampler = sampler;
2427 inst->dst = dst_reg(this, ir->type);
2428 inst->dst.writemask = WRITEMASK_XYZW;
2429 inst->shadow_compare = ir->shadow_comparitor != NULL;
2430
2431 /* MRF for the first parameter */
2432 int param_base = inst->base_mrf + inst->header_present;
2433
2434 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2435 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2436 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2437 } else {
2438 /* Load the coordinate */
2439 /* FINISHME: gl_clamp_mask and saturate */
2440 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2441 int zero_mask = 0xf & ~coord_mask;
2442
2443 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2444 coordinate));
2445
2446 if (zero_mask != 0) {
2447 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2448 src_reg(0)));
2449 }
2450 /* Load the shadow comparitor */
2451 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2452 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2453 WRITEMASK_X),
2454 shadow_comparitor));
2455 inst->mlen++;
2456 }
2457
2458 /* Load the LOD info */
2459 if (ir->op == ir_tex || ir->op == ir_txl) {
2460 int mrf, writemask;
2461 if (brw->gen >= 5) {
2462 mrf = param_base + 1;
2463 if (ir->shadow_comparitor) {
2464 writemask = WRITEMASK_Y;
2465 /* mlen already incremented */
2466 } else {
2467 writemask = WRITEMASK_X;
2468 inst->mlen++;
2469 }
2470 } else /* brw->gen == 4 */ {
2471 mrf = param_base;
2472 writemask = WRITEMASK_W;
2473 }
2474 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2475 } else if (ir->op == ir_txf) {
2476 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2477 } else if (ir->op == ir_txf_ms) {
2478 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2479 sample_index));
2480 if (brw->gen >= 7)
2481 /* MCS data is in the first channel of `mcs`, but we need to get it into
2482 * the .y channel of the second vec4 of params, so replicate .x across
2483 * the whole vec4 and then mask off everything except .y
2484 */
2485 mcs.swizzle = BRW_SWIZZLE_XXXX;
2486 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2487 mcs));
2488 inst->mlen++;
2489 } else if (ir->op == ir_txd) {
2490 const glsl_type *type = lod_type;
2491
2492 if (brw->gen >= 5) {
2493 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2494 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2495 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2496 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2497 inst->mlen++;
2498
2499 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2500 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2501 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2502 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2503 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2504 inst->mlen++;
2505
2506 if (ir->shadow_comparitor) {
2507 emit(MOV(dst_reg(MRF, param_base + 2,
2508 ir->shadow_comparitor->type, WRITEMASK_Z),
2509 shadow_comparitor));
2510 }
2511 }
2512 } else /* brw->gen == 4 */ {
2513 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2514 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2515 inst->mlen += 2;
2516 }
2517 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2518 if (ir->shadow_comparitor) {
2519 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2520 shadow_comparitor));
2521 }
2522
2523 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2524 offset_value));
2525 inst->mlen++;
2526 }
2527 }
2528
2529 emit(inst);
2530
2531 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2532 * spec requires layers.
2533 */
2534 if (ir->op == ir_txs) {
2535 glsl_type const *type = ir->sampler->type;
2536 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2537 type->sampler_array) {
2538 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2539 writemask(inst->dst, WRITEMASK_Z),
2540 src_reg(inst->dst), src_reg(6));
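/* e.g. a cube map array with 2 layers reports 12 (6 faces * 2 layers)
 * from the sampler; the integer divide by 6 above gives back the 2
 * layers GL expects.
 */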
2541 }
2542 }
2543
2544 if (brw->gen == 6 && ir->op == ir_tg4) {
2545 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2546 }
2547
2548 swizzle_result(ir, src_reg(inst->dst), sampler);
2549 }
2550
2551 /**
2552 * Apply workarounds for Gen6 gather with UINT/SINT
2553 */
2554 void
2555 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2556 {
2557 if (!wa)
2558 return;
2559
2560 int width = (wa & WA_8BIT) ? 8 : 16;
2561 dst_reg dst_f = dst;
2562 dst_f.type = BRW_REGISTER_TYPE_F;
2563
2564 /* Convert from UNORM to UINT */
2565 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2566 emit(MOV(dst, src_reg(dst_f)));
2567
2568 if (wa & WA_SIGN) {
2569 /* Reinterpret the UINT value as a signed INT value by
2570 * shifting the sign bit into place, then shifting back
2571 * preserving sign.
2572 */
2573 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2574 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
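/* For the 8-bit case this shifts left then right by 24, turning the
 * recovered 0..255 value into a properly sign-extended 8-bit integer.
 */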
2575 }
2576 }
2577
2578 /**
2579 * Set up the gather channel based on the swizzle, for gather4.
2580 */
2581 uint32_t
2582 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2583 {
2584 ir_constant *chan = ir->lod_info.component->as_constant();
2585 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2586 switch (swiz) {
2587 case SWIZZLE_X: return 0;
2588 case SWIZZLE_Y:
2589 /* gather4 sampler is broken for green channel on RG32F --
2590 * we must ask for blue instead.
2591 */
2592 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2593 return 2;
2594 return 1;
2595 case SWIZZLE_Z: return 2;
2596 case SWIZZLE_W: return 3;
2597 default:
2598 assert(!"Not reached"); /* zero, one swizzles handled already */
2599 return 0;
2600 }
2601 }
2602
2603 void
2604 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2605 {
2606 int s = key->tex.swizzles[sampler];
2607
2608 this->result = src_reg(this, ir->type);
2609 dst_reg swizzled_result(this->result);
2610
2611 if (ir->op == ir_query_levels) {
2612 /* # levels is in .w */
2613 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2614 emit(MOV(swizzled_result, orig_val));
2615 return;
2616 }
2617
2618 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2619 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2620 emit(MOV(swizzled_result, orig_val));
2621 return;
2622 }
2623
2624
2625 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2626 int swizzle[4] = {0};
2627
2628 for (int i = 0; i < 4; i++) {
2629 switch (GET_SWZ(s, i)) {
2630 case SWIZZLE_ZERO:
2631 zero_mask |= (1 << i);
2632 break;
2633 case SWIZZLE_ONE:
2634 one_mask |= (1 << i);
2635 break;
2636 default:
2637 copy_mask |= (1 << i);
2638 swizzle[i] = GET_SWZ(s, i);
2639 break;
2640 }
2641 }
2642
2643 if (copy_mask) {
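/* Illustrative example: a swizzle of (BLUE, ALPHA, ZERO, ONE) gives
 * copy_mask XY reading .zw of the sampler result, zero_mask Z and
 * one_mask W, so up to three masked MOVs are emitted below.
 */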
2644 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2645 swizzled_result.writemask = copy_mask;
2646 emit(MOV(swizzled_result, orig_val));
2647 }
2648
2649 if (zero_mask) {
2650 swizzled_result.writemask = zero_mask;
2651 emit(MOV(swizzled_result, src_reg(0.0f)));
2652 }
2653
2654 if (one_mask) {
2655 swizzled_result.writemask = one_mask;
2656 emit(MOV(swizzled_result, src_reg(1.0f)));
2657 }
2658 }
2659
2660 void
2661 vec4_visitor::visit(ir_return *ir)
2662 {
2663 assert(!"not reached");
2664 }
2665
2666 void
2667 vec4_visitor::visit(ir_discard *ir)
2668 {
2669 assert(!"not reached");
2670 }
2671
2672 void
2673 vec4_visitor::visit(ir_if *ir)
2674 {
2675 /* Don't point the annotation at the if statement, because then it plus
2676 * the then and else blocks get printed.
2677 */
2678 this->base_ir = ir->condition;
2679
2680 if (brw->gen == 6) {
2681 emit_if_gen6(ir);
2682 } else {
2683 uint32_t predicate;
2684 emit_bool_to_cond_code(ir->condition, &predicate);
2685 emit(IF(predicate));
2686 }
2687
2688 visit_instructions(&ir->then_instructions);
2689
2690 if (!ir->else_instructions.is_empty()) {
2691 this->base_ir = ir->condition;
2692 emit(BRW_OPCODE_ELSE);
2693
2694 visit_instructions(&ir->else_instructions);
2695 }
2696
2697 this->base_ir = ir->condition;
2698 emit(BRW_OPCODE_ENDIF);
2699 }
2700
2701 void
2702 vec4_visitor::visit(ir_emit_vertex *)
2703 {
2704 assert(!"not reached");
2705 }
2706
2707 void
2708 vec4_visitor::visit(ir_end_primitive *)
2709 {
2710 assert(!"not reached");
2711 }
2712
2713 void
2714 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2715 dst_reg dst, src_reg offset,
2716 src_reg src0, src_reg src1)
2717 {
2718 unsigned mlen = 0;
2719
2720 /* Set the atomic operation offset. */
2721 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2722 mlen++;
2723
2724 /* Set the atomic operation arguments. */
2725 if (src0.file != BAD_FILE) {
2726 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2727 mlen++;
2728 }
2729
2730 if (src1.file != BAD_FILE) {
2731 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2732 mlen++;
2733 }
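/* e.g. an atomic increment passes only the offset (mlen 1); an operation
 * with one data operand would also fill src0, giving mlen 2.
 */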
2734
2735 /* Emit the instruction. Note that this maps to the normal SIMD8
2736 * untyped atomic message on Ivy Bridge, but that's OK because
2737 * unused channels will be masked out.
2738 */
2739 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2740 src_reg(atomic_op), src_reg(surf_index));
2741 inst->base_mrf = 0;
2742 inst->mlen = mlen;
2743 }
2744
2745 void
2746 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2747 src_reg offset)
2748 {
2749 /* Set the surface read offset. */
2750 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2751
2752 /* Emit the instruction. Note that this maps to the normal SIMD8
2753 * untyped surface read message, but that's OK because unused
2754 * channels will be masked out.
2755 */
2756 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2757 dst, src_reg(surf_index));
2758 inst->base_mrf = 0;
2759 inst->mlen = 1;
2760 }
2761
2762 void
2763 vec4_visitor::emit_ndc_computation()
2764 {
2765 /* Get the position */
2766 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2767
2768 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2769 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2770 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2771
2772 current_annotation = "NDC";
2773 dst_reg ndc_w = ndc;
2774 ndc_w.writemask = WRITEMASK_W;
2775 src_reg pos_w = pos;
2776 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2777 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2778
2779 dst_reg ndc_xyz = ndc;
2780 ndc_xyz.writemask = WRITEMASK_XYZ;
2781
2782 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2783 }
2784
2785 void
2786 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2787 {
2788 if (brw->gen < 6 &&
2789 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2790 key->userclip_active || brw->has_negative_rhw_bug)) {
2791 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2792 dst_reg header1_w = header1;
2793 header1_w.writemask = WRITEMASK_W;
2794
2795 emit(MOV(header1, 0u));
2796
2797 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2798 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2799
2800 current_annotation = "Point size";
2801 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2802 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2803 }
2804
2805 if (key->userclip_active) {
2806 current_annotation = "Clipping flags";
2807 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2808 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2809
2810 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2811 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2812 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2813
2814 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2815 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2816 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2817 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2818 }
2819
2820 /* i965 clipping workaround:
2821 * 1) Test for -ve rhw
2822 * 2) If set,
2823 * set ndc = (0,0,0,0)
2824 * set ucp[6] = 1
2825 *
2826 * Later, clipping will detect ucp[6] and ensure the primitive is
2827 * clipped against all fixed planes.
2828 */
2829 if (brw->has_negative_rhw_bug) {
2830 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2831 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2832 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2833 vec4_instruction *inst;
2834 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2835 inst->predicate = BRW_PREDICATE_NORMAL;
2836 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2837 inst->predicate = BRW_PREDICATE_NORMAL;
2838 }
2839
2840 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2841 } else if (brw->gen < 6) {
2842 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2843 } else {
2844 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2845 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2846 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2847 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2848 }
2849 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2850 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2851 src_reg(output_reg[VARYING_SLOT_LAYER])));
2852 }
2853 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2854 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2855 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2856 }
2857 }
2858 }
2859
2860 void
2861 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2862 {
2863 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2864 *
2865 * "If a linked set of shaders forming the vertex stage contains no
2866 * static write to gl_ClipVertex or gl_ClipDistance, but the
2867 * application has requested clipping against user clip planes through
2868 * the API, then the coordinate written to gl_Position is used for
2869 * comparison against the user clip planes."
2870 *
2871 * This function is only called if the shader didn't write to
2872 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2873 * if the user wrote to it; otherwise we use gl_Position.
2874 */
2875 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2876 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2877 clip_vertex = VARYING_SLOT_POS;
2878 }
2879
2880 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
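/* Each iteration below writes one channel of the clip-distance slot:
 * channel i gets DP4(clip_vertex, userplane[i + offset]).
 */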
2881 ++i) {
2882 reg.writemask = 1 << i;
2883 emit(DP4(reg,
2884 src_reg(output_reg[clip_vertex]),
2885 src_reg(this->userplane[i + offset])));
2886 }
2887 }
2888
2889 void
2890 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2891 {
2892 assert (varying < VARYING_SLOT_MAX);
2893 reg.type = output_reg[varying].type;
2894 current_annotation = output_reg_annotation[varying];
2895 /* Copy the register, saturating if necessary */
2896 vec4_instruction *inst = emit(MOV(reg,
2897 src_reg(output_reg[varying])));
2898 if ((varying == VARYING_SLOT_COL0 ||
2899 varying == VARYING_SLOT_COL1 ||
2900 varying == VARYING_SLOT_BFC0 ||
2901 varying == VARYING_SLOT_BFC1) &&
2902 key->clamp_vertex_color) {
2903 inst->saturate = true;
2904 }
2905 }
2906
2907 void
2908 vec4_visitor::emit_urb_slot(int mrf, int varying)
2909 {
2910 struct brw_reg hw_reg = brw_message_reg(mrf);
2911 dst_reg reg = dst_reg(MRF, mrf);
2912 reg.type = BRW_REGISTER_TYPE_F;
2913
2914 switch (varying) {
2915 case VARYING_SLOT_PSIZ:
2916 /* PSIZ is always in slot 0, and is coupled with other flags. */
2917 current_annotation = "indices, point width, clip flags";
2918 emit_psiz_and_flags(hw_reg);
2919 break;
2920 case BRW_VARYING_SLOT_NDC:
2921 current_annotation = "NDC";
2922 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2923 break;
2924 case VARYING_SLOT_POS:
2925 current_annotation = "gl_Position";
2926 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2927 break;
2928 case VARYING_SLOT_EDGE:
2929 /* This is present when doing unfilled polygons. We're supposed to copy
2930 * the edge flag from the user-provided vertex array
2931 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2932 * of that attribute (starts as 1.0f). This is then used in clipping to
2933 * determine which edges should be drawn as wireframe.
2934 */
2935 current_annotation = "edge flag";
2936 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2937 glsl_type::float_type, WRITEMASK_XYZW))));
2938 break;
2939 case BRW_VARYING_SLOT_PAD:
2940 /* No need to write to this slot */
2941 break;
2942 default:
2943 emit_generic_urb_slot(reg, varying);
2944 break;
2945 }
2946 }
2947
2948 static int
2949 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2950 {
2951 if (brw->gen >= 6) {
2952 /* URB data written (does not include the message header reg) must
2953 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2954 * section 5.4.3.2.2: URB_INTERLEAVED.
2955 *
2956 * URB entries are allocated on a multiple of 1024 bits, so an
2957 * extra 128 bits written here to make the end align to 256 is
2958 * no problem.
2959 */
2960 if ((mlen % 2) != 1)
2961 mlen++;
2962 }
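/* mlen includes the message header, so an even mlen here means an odd
 * number of data registers; bumping it keeps the data portion a multiple
 * of two registers (256 bits). e.g. a header plus 3 slots (mlen 4)
 * becomes mlen 5.
 */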
2963
2964 return mlen;
2965 }
2966
2967
2968 /**
2969 * Generates the VUE payload plus the necessary URB write instructions to
2970 * output it.
2971 *
2972 * The VUE layout is documented in Volume 2a.
2973 */
2974 void
2975 vec4_visitor::emit_vertex()
2976 {
2977 /* MRF 0 is reserved for the debugger, so start with message header
2978 * in MRF 1.
2979 */
2980 int base_mrf = 1;
2981 int mrf = base_mrf;
2982 /* In the process of generating our URB write message contents, we
2983 * may need to unspill a register or load from an array. Those
2984 * reads would use MRFs 14-15.
2985 */
2986 int max_usable_mrf = 13;
2987
2988 /* The following assertion verifies that max_usable_mrf causes an
2989 * even-numbered amount of URB write data, which will meet gen6's
2990 * requirements for length alignment.
2991 */
2992 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2993
2994 /* First mrf is the g0-based message header containing URB handles and
2995 * such.
2996 */
2997 emit_urb_write_header(mrf++);
2998
2999 if (brw->gen < 6) {
3000 emit_ndc_computation();
3001 }
3002
3003 /* Lower legacy ff and ClipVertex clipping to clip distances */
3004 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3005 current_annotation = "user clip distances";
3006
3007 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3008 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3009
3010 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3011 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3012 }
3013
3014 /* We may need to split this up into several URB writes, so do them in a
3015 * loop.
3016 */
3017 int slot = 0;
3018 bool complete = false;
3019 do {
3020 /* URB offset is in URB row increments, and each of our MRFs is half of
3021 * one of those, since we're doing interleaved writes.
3022 */
3023 int offset = slot / 2;
3024
3025 mrf = base_mrf + 1;
3026 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3027 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3028
3029 /* If this was max_usable_mrf, we can't fit anything more into this
3030 * URB WRITE.
3031 */
3032 if (mrf > max_usable_mrf) {
3033 slot++;
3034 break;
3035 }
3036 }
3037
3038 complete = slot >= prog_data->vue_map.num_slots;
3039 current_annotation = "URB write";
3040 vec4_instruction *inst = emit_urb_write_opcode(complete);
3041 inst->base_mrf = base_mrf;
3042 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3043 inst->offset += offset;
3044 } while (!complete);
3045 }
3046
3047
3048 src_reg
3049 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3050 src_reg *reladdr, int reg_offset)
3051 {
3052 /* Because we store the values to scratch interleaved like our
3053 * vertex data, we need to scale the vec4 index by 2.
3054 */
3055 int message_header_scale = 2;
3056
3057 /* Pre-gen6, the message header uses byte offsets instead of vec4
3058 * (16-byte) offset units.
3059 */
3060 if (brw->gen < 6)
3061 message_header_scale *= 16;
3062
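/* e.g. vec4 slot 3 of a spilled array becomes offset 6 on gen6+ (two
 * interleaved vec4s per scratch register row), or byte offset 96 on
 * gen4-5.
 */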
3063 if (reladdr) {
3064 src_reg index = src_reg(this, glsl_type::int_type);
3065
3066 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3067 emit_before(inst, MUL(dst_reg(index),
3068 index, src_reg(message_header_scale)));
3069
3070 return index;
3071 } else {
3072 return src_reg(reg_offset * message_header_scale);
3073 }
3074 }
3075
3076 src_reg
3077 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3078 src_reg *reladdr, int reg_offset)
3079 {
3080 if (reladdr) {
3081 src_reg index = src_reg(this, glsl_type::int_type);
3082
3083 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3084
3085 /* Pre-gen6, the message header uses byte offsets instead of vec4
3086 * (16-byte) offset units.
3087 */
3088 if (brw->gen < 6) {
3089 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3090 }
3091
3092 return index;
3093 } else if (brw->gen >= 8) {
3094 /* Store the offset in a GRF so we can send-from-GRF. */
3095 src_reg offset = src_reg(this, glsl_type::int_type);
3096 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3097 return offset;
3098 } else {
3099 int message_header_scale = brw->gen < 6 ? 16 : 1;
3100 return src_reg(reg_offset * message_header_scale);
3101 }
3102 }
3103
3104 /**
3105 * Emits an instruction before @inst to load the value named by @orig_src
3106 * from scratch space at @base_offset to @temp.
3107 *
3108 * @base_offset is measured in 32-byte units (the size of a register).
3109 */
3110 void
3111 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3112 dst_reg temp, src_reg orig_src,
3113 int base_offset)
3114 {
3115 int reg_offset = base_offset + orig_src.reg_offset;
3116 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3117
3118 emit_before(inst, SCRATCH_READ(temp, index));
3119 }
3120
3121 /**
3122 * Emits an instruction after @inst to store the value to be written
3123 * to @orig_dst to scratch space at @base_offset, from @temp.
3124 *
3125 * @base_offset is measured in 32-byte units (the size of a register).
3126 */
3127 void
3128 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3129 {
3130 int reg_offset = base_offset + inst->dst.reg_offset;
3131 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3132
3133 /* Create a temporary register to store *inst's result in.
3134 *
3135 * We have to be careful in MOVing from our temporary result register in
3136 * the scratch write. If we swizzle from channels of the temporary that
3137 * weren't initialized, it will confuse live interval analysis, which will
3138 * make spilling fail to make progress.
3139 */
3140 src_reg temp = src_reg(this, glsl_type::vec4_type);
3141 temp.type = inst->dst.type;
3142 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3143 int swizzles[4];
3144 for (int i = 0; i < 4; i++)
3145 if (inst->dst.writemask & (1 << i))
3146 swizzles[i] = i;
3147 else
3148 swizzles[i] = first_writemask_chan;
3149 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3150 swizzles[2], swizzles[3]);
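/* e.g. if the instruction only wrote .z, temp is read back as .zzzz so
 * the scratch write below never references an uninitialized channel.
 */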
3151
3152 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3153 inst->dst.writemask));
3154 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3155 write->predicate = inst->predicate;
3156 write->ir = inst->ir;
3157 write->annotation = inst->annotation;
3158 inst->insert_after(write);
3159
3160 inst->dst.file = temp.file;
3161 inst->dst.reg = temp.reg;
3162 inst->dst.reg_offset = temp.reg_offset;
3163 inst->dst.reladdr = NULL;
3164 }
3165
3166 /**
3167 * We can't generally support array access in GRF space, because a
3168 * single instruction's destination can only span 2 contiguous
3169 * registers. So, we send all GRF arrays that get variable index
3170 * access to scratch space.
3171 */
3172 void
3173 vec4_visitor::move_grf_array_access_to_scratch()
3174 {
3175 int scratch_loc[this->virtual_grf_count];
3176
3177 for (int i = 0; i < this->virtual_grf_count; i++) {
3178 scratch_loc[i] = -1;
3179 }
3180
3181 /* First, calculate the set of virtual GRFs that need to be punted
3182 * to scratch due to having any array access on them, and where in
3183 * scratch.
3184 */
3185 foreach_list(node, &this->instructions) {
3186 vec4_instruction *inst = (vec4_instruction *)node;
3187
3188 if (inst->dst.file == GRF && inst->dst.reladdr &&
3189 scratch_loc[inst->dst.reg] == -1) {
3190 scratch_loc[inst->dst.reg] = c->last_scratch;
3191 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3192 }
3193
3194 for (int i = 0 ; i < 3; i++) {
3195 src_reg *src = &inst->src[i];
3196
3197 if (src->file == GRF && src->reladdr &&
3198 scratch_loc[src->reg] == -1) {
3199 scratch_loc[src->reg] = c->last_scratch;
3200 c->last_scratch += this->virtual_grf_sizes[src->reg];
3201 }
3202 }
3203 }
3204
3205 /* Now, for anything that will be accessed through scratch, rewrite
3206 * it to load/store. Note that this is a _safe list walk, because
3207 * we may generate a new scratch_write instruction after the one
3208 * we're processing.
3209 */
3210 foreach_list_safe(node, &this->instructions) {
3211 vec4_instruction *inst = (vec4_instruction *)node;
3212
3213 /* Set up the annotation tracking for new generated instructions. */
3214 base_ir = inst->ir;
3215 current_annotation = inst->annotation;
3216
3217 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3218 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3219 }
3220
3221 for (int i = 0 ; i < 3; i++) {
3222 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3223 continue;
3224
3225 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3226
3227 emit_scratch_read(inst, temp, inst->src[i],
3228 scratch_loc[inst->src[i].reg]);
3229
3230 inst->src[i].file = temp.file;
3231 inst->src[i].reg = temp.reg;
3232 inst->src[i].reg_offset = temp.reg_offset;
3233 inst->src[i].reladdr = NULL;
3234 }
3235 }
3236 }
3237
3238 /**
3239 * Emits an instruction before @inst to load the value named by @orig_src
3240 * from the pull constant buffer (surface) at @base_offset to @temp.
3241 */
3242 void
3243 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3244 dst_reg temp, src_reg orig_src,
3245 int base_offset)
3246 {
3247 int reg_offset = base_offset + orig_src.reg_offset;
3248 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3249 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3250 vec4_instruction *load;
3251
3252 if (brw->gen >= 7) {
3253 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3254 grf_offset.type = offset.type;
3255 emit_before(inst, MOV(grf_offset, offset));
3256
3257 load = new(mem_ctx) vec4_instruction(this,
3258 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3259 temp, index, src_reg(grf_offset));
3260 } else {
3261 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3262 temp, index, offset);
3263 load->base_mrf = 14;
3264 load->mlen = 1;
3265 }
3266 emit_before(inst, load);
3267 }
3268
3269 /**
3270 * Implements array access of uniforms by inserting a
3271 * PULL_CONSTANT_LOAD instruction.
3272 *
3273 * Unlike temporary GRF array access (where we don't support it due to
3274 * the difficulty of doing relative addressing on instruction
3275 * destinations), we could potentially do array access of uniforms
3276 * that were loaded in GRF space as push constants. In real-world
3277 * usage we've seen, though, the arrays being used are always larger
3278 * than we could load as push constants, so just always move all
3279 * uniform array access out to a pull constant buffer.
3280 */
3281 void
3282 vec4_visitor::move_uniform_array_access_to_pull_constants()
3283 {
3284 int pull_constant_loc[this->uniforms];
3285
3286 for (int i = 0; i < this->uniforms; i++) {
3287 pull_constant_loc[i] = -1;
3288 }
3289
3290 /* Walk through and find array access of uniforms. Put a copy of that
3291 * uniform in the pull constant buffer.
3292 *
3293 * Note that we don't move constant-indexed accesses to arrays. No
3294 * testing has been done of the performance impact of this choice.
3295 */
3296 foreach_list_safe(node, &this->instructions) {
3297 vec4_instruction *inst = (vec4_instruction *)node;
3298
3299 for (int i = 0 ; i < 3; i++) {
3300 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3301 continue;
3302
3303 int uniform = inst->src[i].reg;
3304
3305 /* If this array isn't already present in the pull constant buffer,
3306 * add it.
3307 */
3308 if (pull_constant_loc[uniform] == -1) {
3309 const float **values = &stage_prog_data->param[uniform * 4];
3310
3311 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3312
3313 assert(uniform < uniform_array_size);
3314 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3315 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3316 = values[j];
3317 }
3318 }
3319
3320 /* Set up the annotation tracking for new generated instructions. */
3321 base_ir = inst->ir;
3322 current_annotation = inst->annotation;
3323
3324 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3325
3326 emit_pull_constant_load(inst, temp, inst->src[i],
3327 pull_constant_loc[uniform]);
3328
3329 inst->src[i].file = temp.file;
3330 inst->src[i].reg = temp.reg;
3331 inst->src[i].reg_offset = temp.reg_offset;
3332 inst->src[i].reladdr = NULL;
3333 }
3334 }
3335
3336 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3337 * no need to track them as larger-than-vec4 objects. This will be
3338 * relied on in cutting out unused uniform vectors from push
3339 * constants.
3340 */
3341 split_uniform_registers();
3342 }
3343
3344 void
3345 vec4_visitor::resolve_ud_negate(src_reg *reg)
3346 {
3347 if (reg->type != BRW_REGISTER_TYPE_UD ||
3348 !reg->negate)
3349 return;
3350
3351 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3352 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3353 *reg = temp;
3354 }
3355
3356 vec4_visitor::vec4_visitor(struct brw_context *brw,
3357 struct brw_vec4_compile *c,
3358 struct gl_program *prog,
3359 const struct brw_vec4_prog_key *key,
3360 struct brw_vec4_prog_data *prog_data,
3361 struct gl_shader_program *shader_prog,
3362 gl_shader_stage stage,
3363 void *mem_ctx,
3364 bool debug_flag,
3365 bool no_spills,
3366 shader_time_shader_type st_base,
3367 shader_time_shader_type st_written,
3368 shader_time_shader_type st_reset)
3369 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3370 c(c),
3371 key(key),
3372 prog_data(prog_data),
3373 sanity_param_count(0),
3374 fail_msg(NULL),
3375 first_non_payload_grf(0),
3376 need_all_constants_in_pull_buffer(false),
3377 debug_flag(debug_flag),
3378 no_spills(no_spills),
3379 st_base(st_base),
3380 st_written(st_written),
3381 st_reset(st_reset)
3382 {
3383 this->mem_ctx = mem_ctx;
3384 this->failed = false;
3385
3386 this->base_ir = NULL;
3387 this->current_annotation = NULL;
3388 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3389
3390 this->variable_ht = hash_table_ctor(0,
3391 hash_table_pointer_hash,
3392 hash_table_pointer_compare);
3393
3394 this->virtual_grf_start = NULL;
3395 this->virtual_grf_end = NULL;
3396 this->virtual_grf_sizes = NULL;
3397 this->virtual_grf_count = 0;
3398 this->virtual_grf_reg_map = NULL;
3399 this->virtual_grf_reg_count = 0;
3400 this->virtual_grf_array_size = 0;
3401 this->live_intervals_valid = false;
3402
3403 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3404
3405 this->uniforms = 0;
3406
3407 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3408 * at least one. See setup_uniforms() in brw_vec4.cpp.
3409 */
3410 this->uniform_array_size = 1;
3411 if (prog_data) {
3412 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3413 }
3414
3415 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3416 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3417 }
3418
3419 vec4_visitor::~vec4_visitor()
3420 {
3421 hash_table_dtor(this->variable_ht);
3422 }
3423
3424
3425 void
3426 vec4_visitor::fail(const char *format, ...)
3427 {
3428 va_list va;
3429 char *msg;
3430
3431 if (failed)
3432 return;
3433
3434 failed = true;
3435
3436 va_start(va, format);
3437 msg = ralloc_vasprintf(mem_ctx, format, va);
3438 va_end(va);
3439 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3440
3441 this->fail_msg = msg;
3442
3443 if (debug_flag) {
3444 fprintf(stderr, "%s", msg);
3445 }
3446 }
3447
3448 } /* namespace brw */