i965/vs: Add support for LRP instruction.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
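/** Build an IR-annotated instruction: besides the opcode and operands, record
 * the IR node and annotation currently being visited so later debug output
 * can point back at the GLSL IR that produced each instruction.
 */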
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
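/** Insert new_inst into the instruction stream immediately before inst,
 * inheriting inst's IR pointer and annotation so debug output stays attached
 * to the right source.
 */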
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
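/* Convenience constructors for one-, two- and three-source ALU instructions.
 * Note that these only allocate the vec4_instruction; the caller still has to
 * pass the result to emit().
 */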
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 #define ALU3(op) \
111 vec4_instruction * \
112 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
113 { \
114 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
115 src0, src1, src2); \
116 }
117
118 ALU1(NOT)
119 ALU1(MOV)
120 ALU1(FRC)
121 ALU1(RNDD)
122 ALU1(RNDE)
123 ALU1(RNDZ)
124 ALU1(F32TO16)
125 ALU1(F16TO32)
126 ALU2(ADD)
127 ALU2(MUL)
128 ALU2(MACH)
129 ALU2(AND)
130 ALU2(OR)
131 ALU2(XOR)
132 ALU2(DP3)
133 ALU2(DP4)
134 ALU2(DPH)
135 ALU2(SHL)
136 ALU2(SHR)
137 ALU2(ASR)
138 ALU3(LRP)
139
140 /** Gen4 predicated IF. */
141 vec4_instruction *
142 vec4_visitor::IF(uint32_t predicate)
143 {
144 vec4_instruction *inst;
145
146 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
147 inst->predicate = predicate;
148
149 return inst;
150 }
151
152 /** Gen6+ IF with embedded comparison. */
153 vec4_instruction *
154 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
155 {
156 assert(intel->gen >= 6);
157
158 vec4_instruction *inst;
159
160 resolve_ud_negate(&src0);
161 resolve_ud_negate(&src1);
162
163 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
164 src0, src1);
165 inst->conditional_mod = condition;
166
167 return inst;
168 }
169
170 /**
171 * CMP: Sets the low bit of the destination channels with the result
172 * of the comparison, while the upper bits are undefined, and updates
173 * the flag register with the packed 16 bits of the result.
174 */
175 vec4_instruction *
176 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
177 {
178 vec4_instruction *inst;
179
180 /* The original gen4 does type conversion to the destination type
181 * before comparison, producing garbage results for floating
182 * point comparisons.
183 */
184 if (intel->gen == 4) {
185 dst.type = src0.type;
186 if (dst.file == HW_REG)
187 dst.fixed_hw_reg.type = dst.type;
188 }
189
190 resolve_ud_negate(&src0);
191 resolve_ud_negate(&src1);
192
193 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
194 inst->conditional_mod = condition;
195
196 return inst;
197 }
198
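/** Read one vec4 back from scratch space.  The message payload occupies two
 * MRFs starting at MRF 14.
 */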
199 vec4_instruction *
200 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
201 {
202 vec4_instruction *inst;
203
204 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
205 dst, index);
206 inst->base_mrf = 14;
207 inst->mlen = 2;
208
209 return inst;
210 }
211
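/** Write one vec4 out to scratch space.  The message payload occupies three
 * MRFs starting at MRF 13.
 */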
212 vec4_instruction *
213 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
214 {
215 vec4_instruction *inst;
216
217 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
218 dst, src, index);
219 inst->base_mrf = 13;
220 inst->mlen = 3;
221
222 return inst;
223 }
224
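/** Emit a DP2, DP3 or DP4 according to how many components (2-4) are being
 * dotted together.
 */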
225 void
226 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
227 {
228 static enum opcode dot_opcodes[] = {
229 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
230 };
231
232 emit(dot_opcodes[elements - 2], dst, src0, src1);
233 }
234
235 src_reg
236 vec4_visitor::fix_3src_operand(src_reg src)
237 {
238 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
239 * able to use vertical stride of zero to replicate the vec4 uniform, like
240 *
241 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
242 *
243 * But you can't, since vertical stride is always four in three-source
244 * instructions. Instead, insert a MOV instruction to do the replication so
245 * that the three-source instruction can consume it.
246 */
247
248 /* The MOV is only needed if the source is a uniform or immediate. */
249 if (src.file != UNIFORM && src.file != IMM)
250 return src;
251
252 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
253 expanded.type = src.type;
254 emit(MOV(expanded, src));
255 return src_reg(expanded);
256 }
257
258 src_reg
259 vec4_visitor::fix_math_operand(src_reg src)
260 {
261 /* The gen6 math instruction ignores the source modifiers --
262 * swizzle, abs, negate, and at least some parts of the register
263 * region description.
264 *
265 * Rather than trying to enumerate all these cases, *always* expand the
266 * operand to a temp GRF for gen6.
267 *
268 * For gen7, keep the operand as-is, except if immediate, which gen7 still
269 * can't use.
270 */
271
272 if (intel->gen == 7 && src.file != IMM)
273 return src;
274
275 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
276 expanded.type = src.type;
277 emit(MOV(expanded, src));
278 return src_reg(expanded);
279 }
280
281 void
282 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
283 {
284 src = fix_math_operand(src);
285
286 if (dst.writemask != WRITEMASK_XYZW) {
287 /* The gen6 math instruction must be align1, so we can't do
288 * writemasks.
289 */
290 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
291
292 emit(opcode, temp_dst, src);
293
294 emit(MOV(dst, src_reg(temp_dst)));
295 } else {
296 emit(opcode, dst, src);
297 }
298 }
299
300 void
301 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
302 {
303 vec4_instruction *inst = emit(opcode, dst, src);
304 inst->base_mrf = 1;
305 inst->mlen = 1;
306 }
307
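/** Single-operand math.  Gen6+ uses the native math instruction (with operand
 * fixups from fix_math_operand()); gen4/5 math is message-based, so the gen4
 * path also sets up base_mrf/mlen.
 */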
308 void
309 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
310 {
311 switch (opcode) {
312 case SHADER_OPCODE_RCP:
313 case SHADER_OPCODE_RSQ:
314 case SHADER_OPCODE_SQRT:
315 case SHADER_OPCODE_EXP2:
316 case SHADER_OPCODE_LOG2:
317 case SHADER_OPCODE_SIN:
318 case SHADER_OPCODE_COS:
319 break;
320 default:
321 assert(!"not reached: bad math opcode");
322 return;
323 }
324
325 if (intel->gen >= 6) {
326 return emit_math1_gen6(opcode, dst, src);
327 } else {
328 return emit_math1_gen4(opcode, dst, src);
329 }
330 }
331
332 void
333 vec4_visitor::emit_math2_gen6(enum opcode opcode,
334 dst_reg dst, src_reg src0, src_reg src1)
335 {
336 src0 = fix_math_operand(src0);
337 src1 = fix_math_operand(src1);
338
339 if (dst.writemask != WRITEMASK_XYZW) {
340 /* The gen6 math instruction must be align1, so we can't do
341 * writemasks.
342 */
343 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
344 temp_dst.type = dst.type;
345
346 emit(opcode, temp_dst, src0, src1);
347
348 emit(MOV(dst, src_reg(temp_dst)));
349 } else {
350 emit(opcode, dst, src0, src1);
351 }
352 }
353
354 void
355 vec4_visitor::emit_math2_gen4(enum opcode opcode,
356 dst_reg dst, src_reg src0, src_reg src1)
357 {
358 vec4_instruction *inst = emit(opcode, dst, src0, src1);
359 inst->base_mrf = 1;
360 inst->mlen = 2;
361 }
362
363 void
364 vec4_visitor::emit_math(enum opcode opcode,
365 dst_reg dst, src_reg src0, src_reg src1)
366 {
367 switch (opcode) {
368 case SHADER_OPCODE_POW:
369 case SHADER_OPCODE_INT_QUOTIENT:
370 case SHADER_OPCODE_INT_REMAINDER:
371 break;
372 default:
373 assert(!"not reached: unsupported binary math opcode");
374 return;
375 }
376
377 if (intel->gen >= 6) {
378 return emit_math2_gen6(opcode, dst, src0, src1);
379 } else {
380 return emit_math2_gen4(opcode, dst, src0, src1);
381 }
382 }
383
384 void
385 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
386 {
387 if (intel->gen < 7)
388 assert(!"ir_unop_pack_half_2x16 should be lowered");
389
390 assert(dst.type == BRW_REGISTER_TYPE_UD);
391 assert(src0.type == BRW_REGISTER_TYPE_F);
392
393 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
394 *
395 * Because this instruction does not have a 16-bit floating-point type,
396 * the destination data type must be Word (W).
397 *
398 * The destination must be DWord-aligned and specify a horizontal stride
399 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
400 * each destination channel and the upper word is not modified.
401 *
402 * The above restriction implies that the f32to16 instruction must use
403 * align1 mode, because only in align1 mode is it possible to specify
404 * horizontal stride. We choose here to defy the hardware docs and emit
405 * align16 instructions.
406 *
407 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
408 * instructions. I was partially successful in that the code passed all
409 * tests. However, the code was dubiously correct and fragile, and the
410 * tests were not harsh enough to probe that frailty. Not trusting the
411 * code, I chose instead to remain in align16 mode in defiance of the hw
412 * docs).
413 *
414 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
415 * simulator, emitting a f32to16 in align16 mode with UD as destination
416 * data type is safe. The behavior differs from that specified in the PRM
417 * in that the upper word of each destination channel is cleared to 0.
418 */
419
420 dst_reg tmp_dst(this, glsl_type::uvec2_type);
421 src_reg tmp_src(tmp_dst);
422
423 #if 0
424 /* Verify the undocumented behavior on which the following instructions
425 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
426 * then the result of the bit-or instruction below will be incorrect.
427 *
428 * You should inspect the disasm output in order to verify that the MOV is
429 * not optimized away.
430 */
431 emit(MOV(tmp_dst, src_reg(0x12345678u)));
432 #endif
433
434 /* Give tmp the form below, where "." means untouched.
435 *
436 * w z y x w z y x
437 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
438 *
439 * That the upper word of each write-channel be 0 is required for the
440 * following bit-shift and bit-or instructions to work. Note that this
441 * relies on the undocumented hardware behavior mentioned above.
442 */
443 tmp_dst.writemask = WRITEMASK_XY;
444 emit(F32TO16(tmp_dst, src0));
445
446 /* Give the write-channels of dst the form:
447 * 0xhhhh0000
448 */
449 tmp_src.swizzle = SWIZZLE_Y;
450 emit(SHL(dst, tmp_src, src_reg(16u)));
451
452 /* Finally, give the write-channels of dst the form of packHalf2x16's
453 * output:
454 * 0xhhhhllll
455 */
456 tmp_src.swizzle = SWIZZLE_X;
457 emit(OR(dst, src_reg(dst), tmp_src));
458 }
459
460 void
461 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
462 {
463 if (intel->gen < 7)
464 assert(!"ir_unop_unpack_half_2x16 should be lowered");
465
466 assert(dst.type == BRW_REGISTER_TYPE_F);
467 assert(src0.type == BRW_REGISTER_TYPE_UD);
468
469 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
470 *
471 * Because this instruction does not have a 16-bit floating-point type,
472 * the source data type must be Word (W). The destination type must be
473 * F (Float).
474 *
475 * To use W as the source data type, we must adjust horizontal strides,
476 * which is only possible in align1 mode. All my [chadv] attempts at
477 * emitting align1 instructions for unpackHalf2x16 failed to pass the
478 * Piglit tests, so I gave up.
479 *
480 * I've verified that, on gen7 hardware and the simulator, it is safe to
481 * emit f16to32 in align16 mode with UD as source data type.
482 */
483
484 dst_reg tmp_dst(this, glsl_type::uvec2_type);
485 src_reg tmp_src(tmp_dst);
486
487 tmp_dst.writemask = WRITEMASK_X;
488 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
489
490 tmp_dst.writemask = WRITEMASK_Y;
491 emit(SHR(tmp_dst, src0, src_reg(16u)));
492
493 dst.writemask = WRITEMASK_XY;
494 emit(F16TO32(dst, tmp_src));
495 }
496
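/** Visit each IR instruction in the list, keeping base_ir pointed at the
 * current top-level instruction so emitted code gets annotated correctly.
 */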
497 void
498 vec4_visitor::visit_instructions(const exec_list *list)
499 {
500 foreach_list(node, list) {
501 ir_instruction *ir = (ir_instruction *)node;
502
503 base_ir = ir;
504 ir->accept(this);
505 }
506 }
507
508
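/** Size of a variable of the given type, measured in vec4 registers rather
 * than in components.
 */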
509 static int
510 type_size(const struct glsl_type *type)
511 {
512 unsigned int i;
513 int size;
514
515 switch (type->base_type) {
516 case GLSL_TYPE_UINT:
517 case GLSL_TYPE_INT:
518 case GLSL_TYPE_FLOAT:
519 case GLSL_TYPE_BOOL:
520 if (type->is_matrix()) {
521 return type->matrix_columns;
522 } else {
523 /* Regardless of the size of the vector, it gets a vec4. This is bad
524 * packing for things like floats, but otherwise arrays become a
525 * mess. Hopefully a later pass over the code can pack scalars
526 * down if appropriate.
527 */
528 return 1;
529 }
530 case GLSL_TYPE_ARRAY:
531 assert(type->length > 0);
532 return type_size(type->fields.array) * type->length;
533 case GLSL_TYPE_STRUCT:
534 size = 0;
535 for (i = 0; i < type->length; i++) {
536 size += type_size(type->fields.structure[i].type);
537 }
538 return size;
539 case GLSL_TYPE_SAMPLER:
540 /* Samplers take up one slot in UNIFORMS[], but they're baked in
541 * at link time.
542 */
543 return 1;
544 case GLSL_TYPE_VOID:
545 case GLSL_TYPE_ERROR:
546 case GLSL_TYPE_INTERFACE:
547 assert(0);
548 break;
549 }
550
551 return 0;
552 }
553
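/** Allocate a new virtual GRF of the given size (in vec4 registers), growing
 * the tracking arrays as needed, and return its index.
 */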
554 int
555 vec4_visitor::virtual_grf_alloc(int size)
556 {
557 if (virtual_grf_array_size <= virtual_grf_count) {
558 if (virtual_grf_array_size == 0)
559 virtual_grf_array_size = 16;
560 else
561 virtual_grf_array_size *= 2;
562 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
563 virtual_grf_array_size);
564 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
565 virtual_grf_array_size);
566 }
567 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
568 virtual_grf_reg_count += size;
569 virtual_grf_sizes[virtual_grf_count] = size;
570 return virtual_grf_count++;
571 }
572
573 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
574 {
575 init();
576
577 this->file = GRF;
578 this->reg = v->virtual_grf_alloc(type_size(type));
579
580 if (type->is_array() || type->is_record()) {
581 this->swizzle = BRW_SWIZZLE_NOOP;
582 } else {
583 this->swizzle = swizzle_for_size(type->vector_elements);
584 }
585
586 this->type = brw_type_for_base_type(type);
587 }
588
589 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
590 {
591 init();
592
593 this->file = GRF;
594 this->reg = v->virtual_grf_alloc(type_size(type));
595
596 if (type->is_array() || type->is_record()) {
597 this->writemask = WRITEMASK_XYZW;
598 } else {
599 this->writemask = (1 << type->vector_elements) - 1;
600 }
601
602 this->type = brw_type_for_base_type(type);
603 }
604
605 /* Our support for uniforms is piggy-backed on the struct
606 * gl_fragment_program, because that's where the values actually
607 * get stored, rather than in some global gl_shader_program uniform
608 * store.
609 */
610 void
611 vec4_visitor::setup_uniform_values(ir_variable *ir)
612 {
613 int namelen = strlen(ir->name);
614
615 /* The data for our (non-builtin) uniforms is stored in a series of
616 * gl_uniform_driver_storage structs for each subcomponent that
617 * glGetUniformLocation() could name. We know it's been set up in the same
618 * order we'd walk the type, so walk the list of storage and find anything
619 * with our name, or the prefix of a component that starts with our name.
620 */
621 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
622 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
623
624 if (strncmp(ir->name, storage->name, namelen) != 0 ||
625 (storage->name[namelen] != 0 &&
626 storage->name[namelen] != '.' &&
627 storage->name[namelen] != '[')) {
628 continue;
629 }
630
631 gl_constant_value *components = storage->storage;
632 unsigned vector_count = (MAX2(storage->array_elements, 1) *
633 storage->type->matrix_columns);
634
635 for (unsigned s = 0; s < vector_count; s++) {
636 uniform_vector_size[uniforms] = storage->type->vector_elements;
637
638 int i;
639 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
640 prog_data->param[uniforms * 4 + i] = &components->f;
641 components++;
642 }
643 for (; i < 4; i++) {
644 static float zero = 0;
645 prog_data->param[uniforms * 4 + i] = &zero;
646 }
647
648 uniforms++;
649 }
650 }
651 }
652
653 void
654 vec4_visitor::setup_uniform_clipplane_values()
655 {
656 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
657
658 if (intel->gen < 6) {
659 /* Pre-Gen6, we compact clip planes. For example, if the user
660 * enables just clip planes 0, 1, and 3, we will enable clip planes
661 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
662 * plane 2. This simplifies the implementation of the Gen6 clip
663 * thread.
664 */
665 int compacted_clipplane_index = 0;
666 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
667 if (!(key->userclip_planes_enabled_gen_4_5 & (1 << i)))
668 continue;
669
670 this->uniform_vector_size[this->uniforms] = 4;
671 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
672 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
673 for (int j = 0; j < 4; ++j) {
674 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
675 }
676 ++compacted_clipplane_index;
677 ++this->uniforms;
678 }
679 } else {
680 /* In Gen6 and later, we don't compact clip planes, because this
681 * simplifies the implementation of gl_ClipDistance.
682 */
683 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
684 this->uniform_vector_size[this->uniforms] = 4;
685 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
686 this->userplane[i].type = BRW_REGISTER_TYPE_F;
687 for (int j = 0; j < 4; ++j) {
688 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
689 }
690 ++this->uniforms;
691 }
692 }
693 }
694
695 /* Our support for builtin uniforms is even scarier than non-builtin.
696 * It sits on top of the PROG_STATE_VAR parameters that are
697 * automatically updated from GL context state.
698 */
699 void
700 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
701 {
702 const ir_state_slot *const slots = ir->state_slots;
703 assert(ir->state_slots != NULL);
704
705 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
706 /* This state reference has already been setup by ir_to_mesa,
707 * but we'll get the same index back here. We can reference
708 * ParameterValues directly, since unlike brw_fs.cpp, we never
709 * add new state references during compile.
710 */
711 int index = _mesa_add_state_reference(this->prog->Parameters,
712 (gl_state_index *)slots[i].tokens);
713 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
714
715 this->uniform_vector_size[this->uniforms] = 0;
716 /* Add each of the unique swizzled channels of the element.
717 * This will end up matching the size of the glsl_type of this field.
718 */
719 int last_swiz = -1;
720 for (unsigned int j = 0; j < 4; j++) {
721 int swiz = GET_SWZ(slots[i].swizzle, j);
722 last_swiz = swiz;
723
724 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
725 if (swiz <= last_swiz)
726 this->uniform_vector_size[this->uniforms]++;
727 }
728 this->uniforms++;
729 }
730 }
731
732 dst_reg *
733 vec4_visitor::variable_storage(ir_variable *var)
734 {
735 return (dst_reg *)hash_table_find(this->variable_ht, var);
736 }
737
738 void
739 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
740 {
741 ir_expression *expr = ir->as_expression();
742
743 *predicate = BRW_PREDICATE_NORMAL;
744
745 if (expr) {
746 src_reg op[2];
747 vec4_instruction *inst;
748
749 assert(expr->get_num_operands() <= 2);
750 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
751 expr->operands[i]->accept(this);
752 op[i] = this->result;
753
754 resolve_ud_negate(&op[i]);
755 }
756
757 switch (expr->operation) {
758 case ir_unop_logic_not:
759 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
760 inst->conditional_mod = BRW_CONDITIONAL_Z;
761 break;
762
763 case ir_binop_logic_xor:
764 inst = emit(XOR(dst_null_d(), op[0], op[1]));
765 inst->conditional_mod = BRW_CONDITIONAL_NZ;
766 break;
767
768 case ir_binop_logic_or:
769 inst = emit(OR(dst_null_d(), op[0], op[1]));
770 inst->conditional_mod = BRW_CONDITIONAL_NZ;
771 break;
772
773 case ir_binop_logic_and:
774 inst = emit(AND(dst_null_d(), op[0], op[1]));
775 inst->conditional_mod = BRW_CONDITIONAL_NZ;
776 break;
777
778 case ir_unop_f2b:
779 if (intel->gen >= 6) {
780 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
781 } else {
782 inst = emit(MOV(dst_null_f(), op[0]));
783 inst->conditional_mod = BRW_CONDITIONAL_NZ;
784 }
785 break;
786
787 case ir_unop_i2b:
788 if (intel->gen >= 6) {
789 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
790 } else {
791 inst = emit(MOV(dst_null_d(), op[0]));
792 inst->conditional_mod = BRW_CONDITIONAL_NZ;
793 }
794 break;
795
796 case ir_binop_all_equal:
797 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
798 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
799 break;
800
801 case ir_binop_any_nequal:
802 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
803 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
804 break;
805
806 case ir_unop_any:
807 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
808 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
809 break;
810
811 case ir_binop_greater:
812 case ir_binop_gequal:
813 case ir_binop_less:
814 case ir_binop_lequal:
815 case ir_binop_equal:
816 case ir_binop_nequal:
817 emit(CMP(dst_null_d(), op[0], op[1],
818 brw_conditional_for_comparison(expr->operation)));
819 break;
820
821 default:
822 assert(!"not reached");
823 break;
824 }
825 return;
826 }
827
828 ir->accept(this);
829
830 resolve_ud_negate(&this->result);
831
832 if (intel->gen >= 6) {
833 vec4_instruction *inst = emit(AND(dst_null_d(),
834 this->result, src_reg(1)));
835 inst->conditional_mod = BRW_CONDITIONAL_NZ;
836 } else {
837 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 }
840 }
841
842 /**
843 * Emit a gen6 IF statement with the comparison folded into the IF
844 * instruction.
845 */
846 void
847 vec4_visitor::emit_if_gen6(ir_if *ir)
848 {
849 ir_expression *expr = ir->condition->as_expression();
850
851 if (expr) {
852 src_reg op[2];
853 dst_reg temp;
854
855 assert(expr->get_num_operands() <= 2);
856 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
857 expr->operands[i]->accept(this);
858 op[i] = this->result;
859 }
860
861 switch (expr->operation) {
862 case ir_unop_logic_not:
863 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
864 return;
865
866 case ir_binop_logic_xor:
867 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
868 return;
869
870 case ir_binop_logic_or:
871 temp = dst_reg(this, glsl_type::bool_type);
872 emit(OR(temp, op[0], op[1]));
873 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
874 return;
875
876 case ir_binop_logic_and:
877 temp = dst_reg(this, glsl_type::bool_type);
878 emit(AND(temp, op[0], op[1]));
879 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
880 return;
881
882 case ir_unop_f2b:
883 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
884 return;
885
886 case ir_unop_i2b:
887 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
888 return;
889
890 case ir_binop_greater:
891 case ir_binop_gequal:
892 case ir_binop_less:
893 case ir_binop_lequal:
894 case ir_binop_equal:
895 case ir_binop_nequal:
896 emit(IF(op[0], op[1],
897 brw_conditional_for_comparison(expr->operation)));
898 return;
899
900 case ir_binop_all_equal:
901 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
902 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
903 return;
904
905 case ir_binop_any_nequal:
906 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
907 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
908 return;
909
910 case ir_unop_any:
911 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
912 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
913 return;
914
915 default:
916 assert(!"not reached");
917 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
918 return;
919 }
920 return;
921 }
922
923 ir->condition->accept(this);
924
925 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
926 }
927
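/** Return a copy of the register with only the given writemask enabled. */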
928 static dst_reg
929 with_writemask(dst_reg const & r, int mask)
930 {
931 dst_reg result = r;
932 result.writemask = mask;
933 return result;
934 }
935
936 void
937 vec4_vs_visitor::emit_prolog()
938 {
939 dst_reg sign_recovery_shift;
940 dst_reg normalize_factor;
941 dst_reg es3_normalize_factor;
942
943 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
944 if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
945 uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
946 dst_reg reg(ATTR, i);
947 dst_reg reg_d = reg;
948 reg_d.type = BRW_REGISTER_TYPE_D;
949 dst_reg reg_ud = reg;
950 reg_ud.type = BRW_REGISTER_TYPE_UD;
951
952 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
953 * come in as floating point conversions of the integer values.
954 */
955 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
956 dst_reg dst = reg;
957 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
958 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
959 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
960 }
961
962 /* Do sign recovery for 2101010 formats if required. */
963 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
964 if (sign_recovery_shift.file == BAD_FILE) {
965 /* shift constant: <22,22,22,30> */
966 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
967 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
968 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
969 }
970
971 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
972 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
973 }
974
975 /* Apply BGRA swizzle if required. */
976 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
977 src_reg temp = src_reg(reg);
978 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
979 emit(MOV(reg, temp));
980 }
981
982 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
983 /* ES 3.0 has different rules for converting signed normalized
984 * fixed-point numbers than desktop GL.
985 */
986 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
987 /* According to equation 2.2 of the ES 3.0 specification,
988 * signed normalization conversion is done by:
989 *
990 * f = c / (2^(b-1)-1)
991 */
992 if (es3_normalize_factor.file == BAD_FILE) {
993 /* mul constant: 1 / (2^(b-1) - 1) */
994 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
995 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
996 src_reg(1.0f / ((1<<9) - 1))));
997 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
998 src_reg(1.0f / ((1<<1) - 1))));
999 }
1000
1001 dst_reg dst = reg;
1002 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1003 emit(MOV(dst, src_reg(reg_d)));
1004 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
1005 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
1006 } else {
1007 /* The following equations are from the OpenGL 3.2 specification:
1008 *
1009 * 2.1 unsigned normalization
1010 * f = c/(2^n-1)
1011 *
1012 * 2.2 signed normalization
1013 * f = (2c+1)/(2^n-1)
1014 *
1015 * Both of these share a common divisor, which is represented by
1016 * "normalize_factor" in the code below.
1017 */
1018 if (normalize_factor.file == BAD_FILE) {
1019 /* 1 / (2^b - 1) for b=<10,10,10,2> */
1020 normalize_factor = dst_reg(this, glsl_type::vec4_type);
1021 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
1022 src_reg(1.0f / ((1<<10) - 1))));
1023 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
1024 src_reg(1.0f / ((1<<2) - 1))));
1025 }
1026
1027 dst_reg dst = reg;
1028 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1029 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1030
1031 /* For signed normalization, we want the numerator to be 2c+1. */
1032 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
1033 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
1034 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
1035 }
1036
1037 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1038 }
1039 }
1040
1041 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1042 dst_reg dst = reg;
1043 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1044 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1045 }
1046 }
1047 }
1048 }
1049
1050
1051 dst_reg *
1052 vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
1053 {
1054 /* VertexID is stored by the VF as the last vertex element, but
1055 * we don't represent it with a flag in inputs_read, so we call
1056 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1057 */
1058 dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1059 vs_prog_data->uses_vertexid = true;
1060
1061 switch (ir->location) {
1062 case SYSTEM_VALUE_VERTEX_ID:
1063 reg->writemask = WRITEMASK_X;
1064 break;
1065 case SYSTEM_VALUE_INSTANCE_ID:
1066 reg->writemask = WRITEMASK_Y;
1067 break;
1068 default:
1069 assert(!"not reached");
1070 break;
1071 }
1072
1073 return reg;
1074 }
1075
1076
1077 void
1078 vec4_visitor::visit(ir_variable *ir)
1079 {
1080 dst_reg *reg = NULL;
1081
1082 if (variable_storage(ir))
1083 return;
1084
1085 switch (ir->mode) {
1086 case ir_var_shader_in:
1087 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1088 break;
1089
1090 case ir_var_shader_out:
1091 reg = new(mem_ctx) dst_reg(this, ir->type);
1092
1093 for (int i = 0; i < type_size(ir->type); i++) {
1094 output_reg[ir->location + i] = *reg;
1095 output_reg[ir->location + i].reg_offset = i;
1096 output_reg[ir->location + i].type =
1097 brw_type_for_base_type(ir->type->get_scalar_type());
1098 output_reg_annotation[ir->location + i] = ir->name;
1099 }
1100 break;
1101
1102 case ir_var_auto:
1103 case ir_var_temporary:
1104 reg = new(mem_ctx) dst_reg(this, ir->type);
1105 break;
1106
1107 case ir_var_uniform:
1108 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1109
1110 /* Thanks to the lower_ubo_reference pass, we will see only
1111 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1112 * variables, so no need for them to be in variable_ht.
1113 */
1114 if (ir->is_in_uniform_block())
1115 return;
1116
1117 /* Track how big the whole uniform variable is, in case we need to put a
1118 * copy of its data into pull constants for array access.
1119 */
1120 this->uniform_size[this->uniforms] = type_size(ir->type);
1121
1122 if (!strncmp(ir->name, "gl_", 3)) {
1123 setup_builtin_uniform_values(ir);
1124 } else {
1125 setup_uniform_values(ir);
1126 }
1127 break;
1128
1129 case ir_var_system_value:
1130 reg = make_reg_for_system_value(ir);
1131 break;
1132
1133 default:
1134 assert(!"not reached");
1135 }
1136
1137 reg->type = brw_type_for_base_type(ir->type);
1138 hash_table_insert(this->variable_ht, reg, ir);
1139 }
1140
1141 void
1142 vec4_visitor::visit(ir_loop *ir)
1143 {
1144 dst_reg counter;
1145
1146 /* We don't want debugging output to print the whole body of the
1147 * loop as the annotation.
1148 */
1149 this->base_ir = NULL;
1150
1151 if (ir->counter != NULL) {
1152 this->base_ir = ir->counter;
1153 ir->counter->accept(this);
1154 counter = *(variable_storage(ir->counter));
1155
1156 if (ir->from != NULL) {
1157 this->base_ir = ir->from;
1158 ir->from->accept(this);
1159
1160 emit(MOV(counter, this->result));
1161 }
1162 }
1163
1164 emit(BRW_OPCODE_DO);
1165
1166 if (ir->to) {
1167 this->base_ir = ir->to;
1168 ir->to->accept(this);
1169
1170 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1171 brw_conditional_for_comparison(ir->cmp)));
1172
1173 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1174 inst->predicate = BRW_PREDICATE_NORMAL;
1175 }
1176
1177 visit_instructions(&ir->body_instructions);
1178
1179
1180 if (ir->increment) {
1181 this->base_ir = ir->increment;
1182 ir->increment->accept(this);
1183 emit(ADD(counter, src_reg(counter), this->result));
1184 }
1185
1186 emit(BRW_OPCODE_WHILE);
1187 }
1188
1189 void
1190 vec4_visitor::visit(ir_loop_jump *ir)
1191 {
1192 switch (ir->mode) {
1193 case ir_loop_jump::jump_break:
1194 emit(BRW_OPCODE_BREAK);
1195 break;
1196 case ir_loop_jump::jump_continue:
1197 emit(BRW_OPCODE_CONTINUE);
1198 break;
1199 }
1200 }
1201
1202
1203 void
1204 vec4_visitor::visit(ir_function_signature *ir)
1205 {
1206 assert(0);
1207 (void)ir;
1208 }
1209
1210 void
1211 vec4_visitor::visit(ir_function *ir)
1212 {
1213 /* Ignore function bodies other than main() -- we shouldn't see calls to
1214 * them since they should all be inlined.
1215 */
1216 if (strcmp(ir->name, "main") == 0) {
1217 const ir_function_signature *sig;
1218 exec_list empty;
1219
1220 sig = ir->matching_signature(&empty);
1221
1222 assert(sig);
1223
1224 visit_instructions(&sig->body);
1225 }
1226 }
1227
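/** If this expression is just a saturate of some computed value, emit that
 * value followed by a saturating MOV rather than generating separate clamp
 * instructions.  Returns true if the expression was handled here.
 */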
1228 bool
1229 vec4_visitor::try_emit_sat(ir_expression *ir)
1230 {
1231 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1232 if (!sat_src)
1233 return false;
1234
1235 sat_src->accept(this);
1236 src_reg src = this->result;
1237
1238 this->result = src_reg(this, ir->type);
1239 vec4_instruction *inst;
1240 inst = emit(MOV(dst_reg(this->result), src));
1241 inst->saturate = true;
1242
1243 return true;
1244 }
1245
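/** Emit a comparison producing a 0/1 boolean in dst: a CMP with the requested
 * condition, then an AND with 1 to mask off the undefined upper bits.
 */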
1246 void
1247 vec4_visitor::emit_bool_comparison(unsigned int op,
1248 dst_reg dst, src_reg src0, src_reg src1)
1249 {
1250 /* The original gen4 does destination conversion before comparison. */
1251 if (intel->gen < 5)
1252 dst.type = src0.type;
1253
1254 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1255
1256 dst.type = BRW_REGISTER_TYPE_D;
1257 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1258 }
1259
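/** MIN/MAX.  On gen6+ this is a single SEL with a conditional modifier; on
 * earlier parts it is a CMP followed by a predicated SEL.
 */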
1260 void
1261 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1262 src_reg src0, src_reg src1)
1263 {
1264 vec4_instruction *inst;
1265
1266 if (intel->gen >= 6) {
1267 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1268 inst->conditional_mod = conditionalmod;
1269 } else {
1270 emit(CMP(dst, src0, src1, conditionalmod));
1271
1272 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1273 inst->predicate = BRW_PREDICATE_NORMAL;
1274 }
1275 }
1276
1277 void
1278 vec4_visitor::visit(ir_expression *ir)
1279 {
1280 unsigned int operand;
1281 src_reg op[Elements(ir->operands)];
1282 src_reg result_src;
1283 dst_reg result_dst;
1284 vec4_instruction *inst;
1285
1286 if (try_emit_sat(ir))
1287 return;
1288
1289 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1290 this->result.file = BAD_FILE;
1291 ir->operands[operand]->accept(this);
1292 if (this->result.file == BAD_FILE) {
1293 printf("Failed to get tree for expression operand:\n");
1294 ir->operands[operand]->print();
1295 exit(1);
1296 }
1297 op[operand] = this->result;
1298
1299 /* Matrix expression operands should have been broken down to vector
1300 * operations already.
1301 */
1302 assert(!ir->operands[operand]->type->is_matrix());
1303 }
1304
1305 int vector_elements = ir->operands[0]->type->vector_elements;
1306 if (ir->operands[1]) {
1307 vector_elements = MAX2(vector_elements,
1308 ir->operands[1]->type->vector_elements);
1309 }
1310
1311 this->result.file = BAD_FILE;
1312
1313 /* Storage for our result. Ideally for an assignment we'd be using
1314 * the actual storage for the result here, instead.
1315 */
1316 result_src = src_reg(this, ir->type);
1317 /* convenience for the emit functions below. */
1318 result_dst = dst_reg(result_src);
1319 /* If nothing special happens, this is the result. */
1320 this->result = result_src;
1321 /* Limit writes to the channels that will be used by result_src later.
1322 * This does limit this temp's use as a temporary for multi-instruction
1323 * sequences.
1324 */
1325 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1326
1327 switch (ir->operation) {
1328 case ir_unop_logic_not:
1329 /* Note that BRW_OPCODE_NOT is not appropriate here, since it produces
1330 * the one's complement of the whole register, not just bit 0.
1331 */
1332 emit(XOR(result_dst, op[0], src_reg(1)));
1333 break;
1334 case ir_unop_neg:
1335 op[0].negate = !op[0].negate;
1336 this->result = op[0];
1337 break;
1338 case ir_unop_abs:
1339 op[0].abs = true;
1340 op[0].negate = false;
1341 this->result = op[0];
1342 break;
1343
1344 case ir_unop_sign:
1345 emit(MOV(result_dst, src_reg(0.0f)));
1346
1347 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1348 inst = emit(MOV(result_dst, src_reg(1.0f)));
1349 inst->predicate = BRW_PREDICATE_NORMAL;
1350
1351 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1352 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1353 inst->predicate = BRW_PREDICATE_NORMAL;
1354
1355 break;
1356
1357 case ir_unop_rcp:
1358 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1359 break;
1360
1361 case ir_unop_exp2:
1362 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1363 break;
1364 case ir_unop_log2:
1365 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1366 break;
1367 case ir_unop_exp:
1368 case ir_unop_log:
1369 assert(!"not reached: should be handled by ir_explog_to_explog2");
1370 break;
1371 case ir_unop_sin:
1372 case ir_unop_sin_reduced:
1373 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1374 break;
1375 case ir_unop_cos:
1376 case ir_unop_cos_reduced:
1377 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1378 break;
1379
1380 case ir_unop_dFdx:
1381 case ir_unop_dFdy:
1382 assert(!"derivatives not valid in vertex shader");
1383 break;
1384
1385 case ir_unop_noise:
1386 assert(!"not reached: should be handled by lower_noise");
1387 break;
1388
1389 case ir_binop_add:
1390 emit(ADD(result_dst, op[0], op[1]));
1391 break;
1392 case ir_binop_sub:
1393 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1394 break;
1395
1396 case ir_binop_mul:
1397 if (ir->type->is_integer()) {
1398 /* For integer multiplication, the MUL uses the low 16 bits
1399 * of one of the operands (src0 on gen6, src1 on gen7). The
1400 * MACH accumulates in the contribution of the upper 16 bits
1401 * of that operand.
1402 *
1403 * FINISHME: Emit just the MUL if we know an operand is small
1404 * enough.
1405 */
1406 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1407
1408 emit(MUL(acc, op[0], op[1]));
1409 emit(MACH(dst_null_d(), op[0], op[1]));
1410 emit(MOV(result_dst, src_reg(acc)));
1411 } else {
1412 emit(MUL(result_dst, op[0], op[1]));
1413 }
1414 break;
1415 case ir_binop_div:
1416 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1417 assert(ir->type->is_integer());
1418 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1419 break;
1420 case ir_binop_mod:
1421 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1422 assert(ir->type->is_integer());
1423 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1424 break;
1425
1426 case ir_binop_less:
1427 case ir_binop_greater:
1428 case ir_binop_lequal:
1429 case ir_binop_gequal:
1430 case ir_binop_equal:
1431 case ir_binop_nequal: {
1432 emit(CMP(result_dst, op[0], op[1],
1433 brw_conditional_for_comparison(ir->operation)));
1434 emit(AND(result_dst, result_src, src_reg(0x1)));
1435 break;
1436 }
1437
1438 case ir_binop_all_equal:
1439 /* "==" operator producing a scalar boolean. */
1440 if (ir->operands[0]->type->is_vector() ||
1441 ir->operands[1]->type->is_vector()) {
1442 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1443 emit(MOV(result_dst, src_reg(0)));
1444 inst = emit(MOV(result_dst, src_reg(1)));
1445 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1446 } else {
1447 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1448 emit(AND(result_dst, result_src, src_reg(0x1)));
1449 }
1450 break;
1451 case ir_binop_any_nequal:
1452 /* "!=" operator producing a scalar boolean. */
1453 if (ir->operands[0]->type->is_vector() ||
1454 ir->operands[1]->type->is_vector()) {
1455 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1456
1457 emit(MOV(result_dst, src_reg(0)));
1458 inst = emit(MOV(result_dst, src_reg(1)));
1459 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1460 } else {
1461 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1462 emit(AND(result_dst, result_src, src_reg(0x1)));
1463 }
1464 break;
1465
1466 case ir_unop_any:
1467 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1468 emit(MOV(result_dst, src_reg(0)));
1469
1470 inst = emit(MOV(result_dst, src_reg(1)));
1471 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1472 break;
1473
1474 case ir_binop_logic_xor:
1475 emit(XOR(result_dst, op[0], op[1]));
1476 break;
1477
1478 case ir_binop_logic_or:
1479 emit(OR(result_dst, op[0], op[1]));
1480 break;
1481
1482 case ir_binop_logic_and:
1483 emit(AND(result_dst, op[0], op[1]));
1484 break;
1485
1486 case ir_binop_dot:
1487 assert(ir->operands[0]->type->is_vector());
1488 assert(ir->operands[0]->type == ir->operands[1]->type);
1489 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1490 break;
1491
1492 case ir_unop_sqrt:
1493 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1494 break;
1495 case ir_unop_rsq:
1496 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1497 break;
1498
1499 case ir_unop_bitcast_i2f:
1500 case ir_unop_bitcast_u2f:
1501 this->result = op[0];
1502 this->result.type = BRW_REGISTER_TYPE_F;
1503 break;
1504
1505 case ir_unop_bitcast_f2i:
1506 this->result = op[0];
1507 this->result.type = BRW_REGISTER_TYPE_D;
1508 break;
1509
1510 case ir_unop_bitcast_f2u:
1511 this->result = op[0];
1512 this->result.type = BRW_REGISTER_TYPE_UD;
1513 break;
1514
1515 case ir_unop_i2f:
1516 case ir_unop_i2u:
1517 case ir_unop_u2i:
1518 case ir_unop_u2f:
1519 case ir_unop_b2f:
1520 case ir_unop_b2i:
1521 case ir_unop_f2i:
1522 case ir_unop_f2u:
1523 emit(MOV(result_dst, op[0]));
1524 break;
1525 case ir_unop_f2b:
1526 case ir_unop_i2b: {
1527 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1528 emit(AND(result_dst, result_src, src_reg(1)));
1529 break;
1530 }
1531
1532 case ir_unop_trunc:
1533 emit(RNDZ(result_dst, op[0]));
1534 break;
1535 case ir_unop_ceil:
1536 op[0].negate = !op[0].negate;
1537 inst = emit(RNDD(result_dst, op[0]));
1538 this->result.negate = true;
1539 break;
1540 case ir_unop_floor:
1541 inst = emit(RNDD(result_dst, op[0]));
1542 break;
1543 case ir_unop_fract:
1544 inst = emit(FRC(result_dst, op[0]));
1545 break;
1546 case ir_unop_round_even:
1547 emit(RNDE(result_dst, op[0]));
1548 break;
1549
1550 case ir_binop_min:
1551 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1552 break;
1553 case ir_binop_max:
1554 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1555 break;
1556
1557 case ir_binop_pow:
1558 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1559 break;
1560
1561 case ir_unop_bit_not:
1562 inst = emit(NOT(result_dst, op[0]));
1563 break;
1564 case ir_binop_bit_and:
1565 inst = emit(AND(result_dst, op[0], op[1]));
1566 break;
1567 case ir_binop_bit_xor:
1568 inst = emit(XOR(result_dst, op[0], op[1]));
1569 break;
1570 case ir_binop_bit_or:
1571 inst = emit(OR(result_dst, op[0], op[1]));
1572 break;
1573
1574 case ir_binop_lshift:
1575 inst = emit(SHL(result_dst, op[0], op[1]));
1576 break;
1577
1578 case ir_binop_rshift:
1579 if (ir->type->base_type == GLSL_TYPE_INT)
1580 inst = emit(ASR(result_dst, op[0], op[1]));
1581 else
1582 inst = emit(SHR(result_dst, op[0], op[1]));
1583 break;
1584
1585 case ir_binop_ubo_load: {
1586 ir_constant *uniform_block = ir->operands[0]->as_constant();
1587 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1588 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1589 src_reg offset = op[1];
1590
1591 /* Now, load the vector from that offset. */
1592 assert(ir->type->is_vector() || ir->type->is_scalar());
1593
1594 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1595 packed_consts.type = result.type;
1596 src_reg surf_index =
1597 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1598 if (const_offset_ir) {
1599 offset = src_reg(const_offset / 16);
1600 } else {
1601 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1602 }
1603
1604 vec4_instruction *pull =
1605 emit(new(mem_ctx) vec4_instruction(this,
1606 VS_OPCODE_PULL_CONSTANT_LOAD,
1607 dst_reg(packed_consts),
1608 surf_index,
1609 offset));
1610 pull->base_mrf = 14;
1611 pull->mlen = 1;
1612
1613 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1614 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1615 const_offset % 16 / 4,
1616 const_offset % 16 / 4,
1617 const_offset % 16 / 4);
1618
1619 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1620 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1621 emit(CMP(result_dst, packed_consts, src_reg(0u),
1622 BRW_CONDITIONAL_NZ));
1623 emit(AND(result_dst, result, src_reg(0x1)));
1624 } else {
1625 emit(MOV(result_dst, packed_consts));
1626 }
1627 break;
1628 }
1629
1630 case ir_triop_lrp:
1631 op[0] = fix_3src_operand(op[0]);
1632 op[1] = fix_3src_operand(op[1]);
1633 op[2] = fix_3src_operand(op[2]);
1634 emit(LRP(result_dst, op[0], op[1], op[2]));
1635 break;
1636
1637 case ir_quadop_vector:
1638 assert(!"not reached: should be handled by lower_quadop_vector");
1639 break;
1640
1641 case ir_unop_pack_half_2x16:
1642 emit_pack_half_2x16(result_dst, op[0]);
1643 break;
1644 case ir_unop_unpack_half_2x16:
1645 emit_unpack_half_2x16(result_dst, op[0]);
1646 break;
1647 case ir_unop_pack_snorm_2x16:
1648 case ir_unop_pack_snorm_4x8:
1649 case ir_unop_pack_unorm_2x16:
1650 case ir_unop_pack_unorm_4x8:
1651 case ir_unop_unpack_snorm_2x16:
1652 case ir_unop_unpack_snorm_4x8:
1653 case ir_unop_unpack_unorm_2x16:
1654 case ir_unop_unpack_unorm_4x8:
1655 assert(!"not reached: should be handled by lower_packing_builtins");
1656 break;
1657 case ir_unop_unpack_half_2x16_split_x:
1658 case ir_unop_unpack_half_2x16_split_y:
1659 case ir_binop_pack_half_2x16_split:
1660 assert(!"not reached: should not occur in vertex shader");
1661 break;
1662 }
1663 }
1664
1665
1666 void
1667 vec4_visitor::visit(ir_swizzle *ir)
1668 {
1669 src_reg src;
1670 int i = 0;
1671 int swizzle[4];
1672
1673 /* Note that this handles only swizzles in expressions, not those on the
1674 * left hand side of an assignment, which use write masking. See ir_assignment
1675 * for that.
1676 */
1677
1678 ir->val->accept(this);
1679 src = this->result;
1680 assert(src.file != BAD_FILE);
1681
1682 for (i = 0; i < ir->type->vector_elements; i++) {
1683 switch (i) {
1684 case 0:
1685 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1686 break;
1687 case 1:
1688 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1689 break;
1690 case 2:
1691 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1692 break;
1693 case 3:
1694 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1695 break;
1696 }
1697 }
1698 for (; i < 4; i++) {
1699 /* Replicate the last channel out. */
1700 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1701 }
1702
1703 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1704
1705 this->result = src;
1706 }
1707
1708 void
1709 vec4_visitor::visit(ir_dereference_variable *ir)
1710 {
1711 const struct glsl_type *type = ir->type;
1712 dst_reg *reg = variable_storage(ir->var);
1713
1714 if (!reg) {
1715 fail("Failed to find variable storage for %s\n", ir->var->name);
1716 this->result = src_reg(brw_null_reg());
1717 return;
1718 }
1719
1720 this->result = src_reg(*reg);
1721
1722 /* System values get their swizzle from the dst_reg writemask */
1723 if (ir->var->mode == ir_var_system_value)
1724 return;
1725
1726 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1727 this->result.swizzle = swizzle_for_size(type->vector_elements);
1728 }
1729
1730
1731 int
1732 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1733 {
1734 /* Under normal circumstances array elements are stored consecutively, so
1735 * the stride is equal to the size of the array element.
1736 */
1737 return type_size(ir->type);
1738 }
1739
1740
1741 void
1742 vec4_visitor::visit(ir_dereference_array *ir)
1743 {
1744 ir_constant *constant_index;
1745 src_reg src;
1746 int array_stride = compute_array_stride(ir);
1747
1748 constant_index = ir->array_index->constant_expression_value();
1749
1750 ir->array->accept(this);
1751 src = this->result;
1752
1753 if (constant_index) {
1754 src.reg_offset += constant_index->value.i[0] * array_stride;
1755 } else {
1756 /* Variable index array dereference. It eats the "vec4" of the
1757 * base of the array and an index that offsets the Mesa register
1758 * index.
1759 */
1760 ir->array_index->accept(this);
1761
1762 src_reg index_reg;
1763
1764 if (array_stride == 1) {
1765 index_reg = this->result;
1766 } else {
1767 index_reg = src_reg(this, glsl_type::int_type);
1768
1769 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1770 }
1771
1772 if (src.reladdr) {
1773 src_reg temp = src_reg(this, glsl_type::int_type);
1774
1775 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1776
1777 index_reg = temp;
1778 }
1779
1780 src.reladdr = ralloc(mem_ctx, src_reg);
1781 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1782 }
1783
1784 /* If the type is smaller than a vec4, replicate the last channel out. */
1785 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1786 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1787 else
1788 src.swizzle = BRW_SWIZZLE_NOOP;
1789 src.type = brw_type_for_base_type(ir->type);
1790
1791 this->result = src;
1792 }
1793
1794 void
1795 vec4_visitor::visit(ir_dereference_record *ir)
1796 {
1797 unsigned int i;
1798 const glsl_type *struct_type = ir->record->type;
1799 int offset = 0;
1800
1801 ir->record->accept(this);
1802
1803 for (i = 0; i < struct_type->length; i++) {
1804 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1805 break;
1806 offset += type_size(struct_type->fields.structure[i].type);
1807 }
1808
1809 /* If the type is smaller than a vec4, replicate the last channel out. */
1810 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1811 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1812 else
1813 this->result.swizzle = BRW_SWIZZLE_NOOP;
1814 this->result.type = brw_type_for_base_type(ir->type);
1815
1816 this->result.reg_offset += offset;
1817 }
1818
1819 /**
1820 * We want to be careful in assignment setup to hit the actual storage
1821 * instead of potentially using a temporary like we might with the
1822 * ir_dereference handler.
1823 */
1824 static dst_reg
1825 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1826 {
1827 /* The LHS must be a dereference. If the LHS is a variable indexed array
1828 * access of a vector, it must be separated into a series of conditional moves
1829 * before reaching this point (see ir_vec_index_to_cond_assign).
1830 */
1831 assert(ir->as_dereference());
1832 ir_dereference_array *deref_array = ir->as_dereference_array();
1833 if (deref_array) {
1834 assert(!deref_array->array->type->is_vector());
1835 }
1836
1837 /* Use the rvalue deref handler for the most part. We'll ignore
1838 * swizzles in it and write swizzles using writemask, though.
1839 */
1840 ir->accept(v);
1841 return dst_reg(v->result);
1842 }
1843
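/** Copy an aggregate (struct, array or matrix) one vec4 at a time, advancing
 * both registers' reg_offset as each element is moved.
 */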
1844 void
1845 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1846 const struct glsl_type *type, uint32_t predicate)
1847 {
1848 if (type->base_type == GLSL_TYPE_STRUCT) {
1849 for (unsigned int i = 0; i < type->length; i++) {
1850 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1851 }
1852 return;
1853 }
1854
1855 if (type->is_array()) {
1856 for (unsigned int i = 0; i < type->length; i++) {
1857 emit_block_move(dst, src, type->fields.array, predicate);
1858 }
1859 return;
1860 }
1861
1862 if (type->is_matrix()) {
1863 const struct glsl_type *vec_type;
1864
1865 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1866 type->vector_elements, 1);
1867
1868 for (int i = 0; i < type->matrix_columns; i++) {
1869 emit_block_move(dst, src, vec_type, predicate);
1870 }
1871 return;
1872 }
1873
1874 assert(type->is_scalar() || type->is_vector());
1875
1876 dst->type = brw_type_for_base_type(type);
1877 src->type = dst->type;
1878
1879 dst->writemask = (1 << type->vector_elements) - 1;
1880
1881 src->swizzle = swizzle_for_size(type->vector_elements);
1882
1883 vec4_instruction *inst = emit(MOV(*dst, *src));
1884 inst->predicate = predicate;
1885
1886 dst->reg_offset++;
1887 src->reg_offset++;
1888 }
1889
1890
1891 /* If the RHS processing resulted in an instruction generating a
1892 * temporary value, and it would be easy to rewrite the instruction to
1893 * generate its result right into the LHS instead, do so. This ends
1894 * up reliably removing instructions where it can be tricky to do so
1895 * later without real UD chain information.
1896 */
1897 bool
1898 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1899 dst_reg dst,
1900 src_reg src,
1901 vec4_instruction *pre_rhs_inst,
1902 vec4_instruction *last_rhs_inst)
1903 {
1904 /* This could be supported, but it would take more smarts. */
1905 if (ir->condition)
1906 return false;
1907
1908 if (pre_rhs_inst == last_rhs_inst)
1909 return false; /* No instructions generated to work with. */
1910
1911 /* Make sure the last instruction generated our source reg. */
1912 if (src.file != GRF ||
1913 src.file != last_rhs_inst->dst.file ||
1914 src.reg != last_rhs_inst->dst.reg ||
1915 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1916 src.reladdr ||
1917 src.abs ||
1918 src.negate ||
1919 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1920 return false;
1921
1922 /* Check that the last instruction fully initialized the channels
1923 * we want to use, in the order we want to use them. We could
1924 * potentially reswizzle the operands of many instructions so that
1925 * we could handle out of order channels, but don't yet.
1926 */
1927
1928 for (unsigned i = 0; i < 4; i++) {
1929 if (dst.writemask & (1 << i)) {
1930 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1931 return false;
1932
1933 if (BRW_GET_SWZ(src.swizzle, i) != i)
1934 return false;
1935 }
1936 }
1937
1938 /* Success! Rewrite the instruction. */
1939 last_rhs_inst->dst.file = dst.file;
1940 last_rhs_inst->dst.reg = dst.reg;
1941 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1942 last_rhs_inst->dst.reladdr = dst.reladdr;
1943 last_rhs_inst->dst.writemask &= dst.writemask;
1944
1945 return true;
1946 }
1947
1948 void
1949 vec4_visitor::visit(ir_assignment *ir)
1950 {
1951 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1952 uint32_t predicate = BRW_PREDICATE_NONE;
1953
1954 if (!ir->lhs->type->is_scalar() &&
1955 !ir->lhs->type->is_vector()) {
1956 ir->rhs->accept(this);
1957 src_reg src = this->result;
1958
1959 if (ir->condition) {
1960 emit_bool_to_cond_code(ir->condition, &predicate);
1961 }
1962
1963 /* emit_block_move doesn't account for swizzles in the source register.
1964 * This should be ok, since the source register is a structure, array,
1965 * or matrix, and those can't carry arbitrary swizzles. But double-check to be sure.
1966 */
1967 assert(src.swizzle ==
1968 (ir->rhs->type->is_matrix()
1969 ? swizzle_for_size(ir->rhs->type->vector_elements)
1970 : BRW_SWIZZLE_NOOP));
1971
1972 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1973 return;
1974 }
1975
1976 /* Now we're down to just a scalar/vector with writemasks. */
1977 int i;
1978
1979 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1980 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1981
1982 ir->rhs->accept(this);
1983
1984 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1985
1986 src_reg src = this->result;
1987
1988 int swizzles[4];
1989 int first_enabled_chan = 0;
1990 int src_chan = 0;
1991
1992 assert(ir->lhs->type->is_vector() ||
1993 ir->lhs->type->is_scalar());
1994 dst.writemask = ir->write_mask;
1995
1996 for (int i = 0; i < 4; i++) {
1997 if (dst.writemask & (1 << i)) {
1998 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1999 break;
2000 }
2001 }
2002
2003 /* Swizzle a small RHS vector into the channels being written.
2004 *
2005 * GLSL IR treats write_mask as dictating how many channels are
2006 * present on the RHS, while in our instructions we need to make
2007 * those channels appear in the slots of the vec4 they're written to.
2008 */
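/* For example, with dst.writemask = WRITEMASK_ZW and src.swizzle = XYYY
 * (a two-channel RHS replicated out), the loop below produces YYXY: the
 * disabled X/Y slots just repeat the first enabled channel, while the
 * RHS's first two channels are routed into the Z and W slots being written.
 */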
2009 for (int i = 0; i < 4; i++) {
2010 if (dst.writemask & (1 << i))
2011 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2012 else
2013 swizzles[i] = first_enabled_chan;
2014 }
2015 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2016 swizzles[2], swizzles[3]);
2017
2018 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2019 return;
2020 }
2021
2022 if (ir->condition) {
2023 emit_bool_to_cond_code(ir->condition, &predicate);
2024 }
2025
2026 for (i = 0; i < type_size(ir->lhs->type); i++) {
2027 vec4_instruction *inst = emit(MOV(dst, src));
2028 inst->predicate = predicate;
2029
2030 dst.reg_offset++;
2031 src.reg_offset++;
2032 }
2033 }
2034
2035 void
2036 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2037 {
2038 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2039 foreach_list(node, &ir->components) {
2040 ir_constant *field_value = (ir_constant *)node;
2041
2042 emit_constant_values(dst, field_value);
2043 }
2044 return;
2045 }
2046
2047 if (ir->type->is_array()) {
2048 for (unsigned int i = 0; i < ir->type->length; i++) {
2049 emit_constant_values(dst, ir->array_elements[i]);
2050 }
2051 return;
2052 }
2053
2054 if (ir->type->is_matrix()) {
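      /* Each matrix column occupies its own register; write each column one
       * float component at a time.
       */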
2055 for (int i = 0; i < ir->type->matrix_columns; i++) {
2056 float *vec = &ir->value.f[i * ir->type->vector_elements];
2057
2058 for (int j = 0; j < ir->type->vector_elements; j++) {
2059 dst->writemask = 1 << j;
2060 dst->type = BRW_REGISTER_TYPE_F;
2061
2062 emit(MOV(*dst, src_reg(vec[j])));
2063 }
2064 dst->reg_offset++;
2065 }
2066 return;
2067 }
2068
2069 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2070
2071 for (int i = 0; i < ir->type->vector_elements; i++) {
2072 if (!(remaining_writemask & (1 << i)))
2073 continue;
2074
2075 dst->writemask = 1 << i;
2076 dst->type = brw_type_for_base_type(ir->type);
2077
2078 /* Find other components that match the one we're about to
2079 * write. Emits fewer instructions for things like vec4(0.5,
2080 * 1.5, 1.5, 1.5).
2081 */
2082 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2083 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2084 if (ir->value.b[i] == ir->value.b[j])
2085 dst->writemask |= (1 << j);
2086 } else {
2087 /* u, i, and f storage all line up, so no need for a
2088 * switch case for comparing each type.
2089 */
2090 if (ir->value.u[i] == ir->value.u[j])
2091 dst->writemask |= (1 << j);
2092 }
2093 }
2094
2095 switch (ir->type->base_type) {
2096 case GLSL_TYPE_FLOAT:
2097 emit(MOV(*dst, src_reg(ir->value.f[i])));
2098 break;
2099 case GLSL_TYPE_INT:
2100 emit(MOV(*dst, src_reg(ir->value.i[i])));
2101 break;
2102 case GLSL_TYPE_UINT:
2103 emit(MOV(*dst, src_reg(ir->value.u[i])));
2104 break;
2105 case GLSL_TYPE_BOOL:
2106 emit(MOV(*dst, src_reg(ir->value.b[i])));
2107 break;
2108 default:
2109 assert(!"Non-float/uint/int/bool constant");
2110 break;
2111 }
2112
2113 remaining_writemask &= ~dst->writemask;
2114 }
2115 dst->reg_offset++;
2116 }
2117
2118 void
2119 vec4_visitor::visit(ir_constant *ir)
2120 {
2121 dst_reg dst = dst_reg(this, ir->type);
2122 this->result = src_reg(dst);
2123
2124 emit_constant_values(&dst, ir);
2125 }
2126
2127 void
2128 vec4_visitor::visit(ir_call *ir)
2129 {
2130 assert(!"not reached");
2131 }
2132
2133 void
2134 vec4_visitor::visit(ir_texture *ir)
2135 {
2136 int sampler =
2137 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2138
2139 /* Should be lowered by do_lower_texture_projection */
2140 assert(!ir->projector);
2141
2142 /* Generate code to compute all the subexpression trees. This has to be
2143 * done before loading any values into MRFs for the sampler message since
2144 * generating these values may involve SEND messages that need the MRFs.
2145 */
2146 src_reg coordinate;
2147 if (ir->coordinate) {
2148 ir->coordinate->accept(this);
2149 coordinate = this->result;
2150 }
2151
2152 src_reg shadow_comparitor;
2153 if (ir->shadow_comparitor) {
2154 ir->shadow_comparitor->accept(this);
2155 shadow_comparitor = this->result;
2156 }
2157
2158 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2159 src_reg lod, dPdx, dPdy, sample_index;
2160 switch (ir->op) {
2161 case ir_tex:
2162 lod = src_reg(0.0f);
2163 lod_type = glsl_type::float_type;
2164 break;
2165 case ir_txf:
2166 case ir_txl:
2167 case ir_txs:
2168 ir->lod_info.lod->accept(this);
2169 lod = this->result;
2170 lod_type = ir->lod_info.lod->type;
2171 break;
2172 case ir_txf_ms:
2173 ir->lod_info.sample_index->accept(this);
2174 sample_index = this->result;
2175 sample_index_type = ir->lod_info.sample_index->type;
2176 break;
2177 case ir_txd:
2178 ir->lod_info.grad.dPdx->accept(this);
2179 dPdx = this->result;
2180
2181 ir->lod_info.grad.dPdy->accept(this);
2182 dPdy = this->result;
2183
2184 lod_type = ir->lod_info.grad.dPdx->type;
2185 break;
2186 case ir_txb:
2187 case ir_lod:
2188 break;
2189 }
2190
2191 vec4_instruction *inst = NULL;
2192 switch (ir->op) {
2193 case ir_tex:
2194 case ir_txl:
2195 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2196 break;
2197 case ir_txd:
2198 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2199 break;
2200 case ir_txf:
2201 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2202 break;
2203 case ir_txf_ms:
2204 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2205 break;
2206 case ir_txs:
2207 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2208 break;
2209 case ir_txb:
2210 assert(!"TXB is not valid for vertex shaders.");
2211 break;
2212 case ir_lod:
2213 assert(!"LOD is not valid for vertex shaders.");
2214 break;
2215 }
2216
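   /* txf applies its offset by adding it directly into the texel coordinate
    * below, so only the other operations route a constant offset through the
    * message header.
    */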
2217 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2218
2219 /* Texel offsets go in the message header; Gen4 also requires headers. */
2220 inst->header_present = use_texture_offset || intel->gen < 5;
2221 inst->base_mrf = 2;
2222 inst->mlen = inst->header_present + 1; /* always at least one */
2223 inst->sampler = sampler;
2224 inst->dst = dst_reg(this, ir->type);
2225 inst->dst.writemask = WRITEMASK_XYZW;
2226 inst->shadow_compare = ir->shadow_comparitor != NULL;
2227
2228 if (use_texture_offset)
2229 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2230
2231 /* MRF for the first parameter */
2232 int param_base = inst->base_mrf + inst->header_present;
2233
2234 if (ir->op == ir_txs) {
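      /* The txs LOD parameter goes in the .w channel of the first parameter
       * on gen4 and in .x on gen5+.
       */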
2235 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2236 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2237 } else {
2238 int i, coord_mask = 0, zero_mask = 0;
2239 /* Load the coordinate */
2240 /* FINISHME: gl_clamp_mask and saturate */
2241 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2242 coord_mask |= (1 << i);
2243 for (; i < 4; i++)
2244 zero_mask |= (1 << i);
2245
2246 if (ir->offset && ir->op == ir_txf) {
2247 /* It appears that the ld instruction used for txf does its
2248 * address bounds check before adding in the offset. To work
2249 * around this, just add the integer offset to the integer
2250 * texel coordinate, and don't put the offset in the header.
2251 */
2252 ir_constant *offset = ir->offset->as_constant();
2253 assert(offset);
2254
2255 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2256 src_reg src = coordinate;
2257 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2258 BRW_GET_SWZ(src.swizzle, j),
2259 BRW_GET_SWZ(src.swizzle, j),
2260 BRW_GET_SWZ(src.swizzle, j));
2261 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2262 src, offset->value.i[j]));
2263 }
2264 } else {
2265 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2266 coordinate));
2267 }
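      /* Zero out the coordinate channels that the coordinate itself doesn't
       * cover.
       */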
2268 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2269 src_reg(0)));
2270 /* Load the shadow comparitor */
2271 if (ir->shadow_comparitor) {
2272 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2273 WRITEMASK_X),
2274 shadow_comparitor));
2275 inst->mlen++;
2276 }
2277
2278 /* Load the LOD info */
2279 if (ir->op == ir_tex || ir->op == ir_txl) {
2280 int mrf, writemask;
2281 if (intel->gen >= 5) {
2282 mrf = param_base + 1;
2283 if (ir->shadow_comparitor) {
2284 writemask = WRITEMASK_Y;
2285 /* mlen already incremented */
2286 } else {
2287 writemask = WRITEMASK_X;
2288 inst->mlen++;
2289 }
2290 } else /* intel->gen == 4 */ {
2291 mrf = param_base;
2292 writemask = WRITEMASK_Z;
2293 }
2294 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2295 } else if (ir->op == ir_txf) {
2296 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2297 } else if (ir->op == ir_txf_ms) {
2298 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2299 sample_index));
2300 inst->mlen++;
2301
2302 /* On Gen7, there is an additional MCS parameter here after SI,
2303 * but we don't bother to emit it since it's always zero. If
2304 * we start supporting texturing from CMS surfaces, this will have
2305 * to change.
2306 */
2307 } else if (ir->op == ir_txd) {
2308 const glsl_type *type = lod_type;
2309
2310 if (intel->gen >= 5) {
2311 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2312 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2313 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2314 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2315 inst->mlen++;
2316
2317 if (ir->type->vector_elements == 3) {
2318 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2319 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2320 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2321 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2322 inst->mlen++;
2323 }
2324 } else /* intel->gen == 4 */ {
2325 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2326 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2327 inst->mlen += 2;
2328 }
2329 }
2330 }
2331
2332 emit(inst);
2333
2334 /* Fix up the number of layers (Z) for cube arrays: the hardware returns
2335 * faces * layers, but the spec requires just layers, so divide Z by 6.
2336 */
2337 if (ir->op == ir_txs) {
2338 glsl_type const *type = ir->sampler->type;
2339 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2340 type->sampler_array) {
2341 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2342 with_writemask(inst->dst, WRITEMASK_Z),
2343 src_reg(inst->dst), src_reg(6));
2344 }
2345 }
2346
2347 swizzle_result(ir, src_reg(inst->dst), sampler);
2348 }
2349
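/* A brief description added here, based on what the function below does:
 * it applies the texture swizzle from the program key (e.g. the
 * GL_TEXTURE_SWIZZLE_* state) to the raw sampler result, copying the
 * selected channels and filling constant channels with 0.0 or 1.0.
 */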
2350 void
2351 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2352 {
2353 int s = key->tex.swizzles[sampler];
2354
2355 this->result = src_reg(this, ir->type);
2356 dst_reg swizzled_result(this->result);
2357
2358 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2359 || s == SWIZZLE_NOOP) {
2360 emit(MOV(swizzled_result, orig_val));
2361 return;
2362 }
2363
2364 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2365 int swizzle[4];
2366
2367 for (int i = 0; i < 4; i++) {
2368 switch (GET_SWZ(s, i)) {
2369 case SWIZZLE_ZERO:
2370 zero_mask |= (1 << i);
2371 break;
2372 case SWIZZLE_ONE:
2373 one_mask |= (1 << i);
2374 break;
2375 default:
2376 copy_mask |= (1 << i);
2377 swizzle[i] = GET_SWZ(s, i);
2378 break;
2379 }
2380 }
2381
2382 if (copy_mask) {
2383 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2384 swizzled_result.writemask = copy_mask;
2385 emit(MOV(swizzled_result, orig_val));
2386 }
2387
2388 if (zero_mask) {
2389 swizzled_result.writemask = zero_mask;
2390 emit(MOV(swizzled_result, src_reg(0.0f)));
2391 }
2392
2393 if (one_mask) {
2394 swizzled_result.writemask = one_mask;
2395 emit(MOV(swizzled_result, src_reg(1.0f)));
2396 }
2397 }
2398
2399 void
2400 vec4_visitor::visit(ir_return *ir)
2401 {
2402 assert(!"not reached");
2403 }
2404
2405 void
2406 vec4_visitor::visit(ir_discard *ir)
2407 {
2408 assert(!"not reached");
2409 }
2410
2411 void
2412 vec4_visitor::visit(ir_if *ir)
2413 {
2414 /* Don't point the annotation at the if statement, because then the whole
2415 * statement, including the then and else blocks, would get printed with it.
2416 */
2417 this->base_ir = ir->condition;
2418
2419 if (intel->gen == 6) {
2420 emit_if_gen6(ir);
2421 } else {
2422 uint32_t predicate;
2423 emit_bool_to_cond_code(ir->condition, &predicate);
2424 emit(IF(predicate));
2425 }
2426
2427 visit_instructions(&ir->then_instructions);
2428
2429 if (!ir->else_instructions.is_empty()) {
2430 this->base_ir = ir->condition;
2431 emit(BRW_OPCODE_ELSE);
2432
2433 visit_instructions(&ir->else_instructions);
2434 }
2435
2436 this->base_ir = ir->condition;
2437 emit(BRW_OPCODE_ENDIF);
2438 }
2439
2440 void
2441 vec4_visitor::emit_ndc_computation()
2442 {
2443 /* Get the position */
2444 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2445
2446 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2447 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2448 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2449
2450 current_annotation = "NDC";
2451 dst_reg ndc_w = ndc;
2452 ndc_w.writemask = WRITEMASK_W;
2453 src_reg pos_w = pos;
2454 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2455 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2456
2457 dst_reg ndc_xyz = ndc;
2458 ndc_xyz.writemask = WRITEMASK_XYZ;
2459
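   /* ndc.w now holds 1/pos.w, so this produces (x/w, y/w, z/w). */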
2460 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2461 }
2462
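/* A brief description added here, based on what the function below does:
 * it writes the VUE header slot holding the point size and clip flags.
 * On gen4/5 the packed word (point width, clip flags, workaround bit) goes
 * in .w; on gen6+ the register is zeroed and gl_PointSize, if written by
 * the shader, goes in .w.
 */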
2463 void
2464 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2465 {
2466 if (intel->gen < 6 &&
2467 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2468 key->userclip_active || brw->has_negative_rhw_bug)) {
2469 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2470 dst_reg header1_w = header1;
2471 header1_w.writemask = WRITEMASK_W;
2472 GLuint i;
2473
2474 emit(MOV(header1, 0u));
2475
2476 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2477 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2478
2479 current_annotation = "Point size";
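	 /* Scale the float point size by 2^11 so it lands in the 11-bit
	  * fixed-point point-width field starting at bit 8 of the header
	  * DWord, then mask off everything outside that field.
	  */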
2480 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2481 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2482 }
2483
2484 current_annotation = "Clipping flags";
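      /* For each user clip plane, set bit i of the header if the position
       * is on the negative side of that plane (DP4 result < 0).
       */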
2485 for (i = 0; i < key->nr_userclip_plane_consts; i++) {
2486 vec4_instruction *inst;
2487
2488 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VARYING_SLOT_POS]),
2489 src_reg(this->userplane[i])));
2490 inst->conditional_mod = BRW_CONDITIONAL_L;
2491
2492 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2493 inst->predicate = BRW_PREDICATE_NORMAL;
2494 }
2495
2496 /* i965 clipping workaround:
2497 * 1) Test for negative RHW
2498 * 2) If set,
2499 * set ndc = (0,0,0,0)
2500 * set ucp[6] = 1
2501 *
2502 * Later, clipping will detect ucp[6] and ensure the primitive is
2503 * clipped against all fixed planes.
2504 */
2505 if (brw->has_negative_rhw_bug) {
2506 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2507 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2508 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2509 vec4_instruction *inst;
2510 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2511 inst->predicate = BRW_PREDICATE_NORMAL;
2512 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2513 inst->predicate = BRW_PREDICATE_NORMAL;
2514 }
2515
2516 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2517 } else if (intel->gen < 6) {
2518 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2519 } else {
2520 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2521 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2522 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2523 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2524 }
2525 }
2526 }
2527
2528 void
2529 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2530 {
2531 if (intel->gen < 6) {
2532 /* Clip distance slots are set aside in gen5, but they are not used. It
2533 * is not clear whether we actually need to set aside space for them,
2534 * but the performance cost is negligible.
2535 */
2536 return;
2537 }
2538
2539 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2540 *
2541 * "If a linked set of shaders forming the vertex stage contains no
2542 * static write to gl_ClipVertex or gl_ClipDistance, but the
2543 * application has requested clipping against user clip planes through
2544 * the API, then the coordinate written to gl_Position is used for
2545 * comparison against the user clip planes."
2546 *
2547 * This function is only called if the shader didn't write to
2548 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2549 * if the user wrote to it; otherwise we use gl_Position.
2550 */
2551 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2552 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2553 clip_vertex = VARYING_SLOT_POS;
2554 }
2555
2556 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2557 ++i) {
2558 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2559 src_reg(output_reg[clip_vertex]),
2560 src_reg(this->userplane[i + offset])));
2561 }
2562 }
2563
2564 void
2565 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2566 {
2567 assert (varying < VARYING_SLOT_MAX);
2568 reg.type = output_reg[varying].type;
2569 current_annotation = output_reg_annotation[varying];
2570 /* Copy the register, saturating if necessary */
2571 vec4_instruction *inst = emit(MOV(reg,
2572 src_reg(output_reg[varying])));
2573 if ((varying == VARYING_SLOT_COL0 ||
2574 varying == VARYING_SLOT_COL1 ||
2575 varying == VARYING_SLOT_BFC0 ||
2576 varying == VARYING_SLOT_BFC1) &&
2577 key->clamp_vertex_color) {
2578 inst->saturate = true;
2579 }
2580 }
2581
2582 void
2583 vec4_visitor::emit_urb_slot(int mrf, int varying)
2584 {
2585 struct brw_reg hw_reg = brw_message_reg(mrf);
2586 dst_reg reg = dst_reg(MRF, mrf);
2587 reg.type = BRW_REGISTER_TYPE_F;
2588
2589 switch (varying) {
2590 case VARYING_SLOT_PSIZ:
2591 /* PSIZ is always in slot 0, and is coupled with other flags. */
2592 current_annotation = "indices, point width, clip flags";
2593 emit_psiz_and_flags(hw_reg);
2594 break;
2595 case BRW_VARYING_SLOT_NDC:
2596 current_annotation = "NDC";
2597 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2598 break;
2599 case BRW_VARYING_SLOT_POS_DUPLICATE:
2600 case VARYING_SLOT_POS:
2601 current_annotation = "gl_Position";
2602 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2603 break;
2604 case VARYING_SLOT_CLIP_DIST0:
2605 case VARYING_SLOT_CLIP_DIST1:
2606 if (this->key->uses_clip_distance) {
2607 emit_generic_urb_slot(reg, varying);
2608 } else {
2609 current_annotation = "user clip distances";
2610 emit_clip_distances(hw_reg, (varying - VARYING_SLOT_CLIP_DIST0) * 4);
2611 }
2612 break;
2613 case VARYING_SLOT_EDGE:
2614 /* This is present when doing unfilled polygons. We're supposed to copy
2615 * the edge flag from the user-provided vertex array
2616 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2617 * of that attribute (starts as 1.0f). This is then used in clipping to
2618 * determine which edges should be drawn as wireframe.
2619 */
2620 current_annotation = "edge flag";
2621 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2622 glsl_type::float_type, WRITEMASK_XYZW))));
2623 break;
2624 case BRW_VARYING_SLOT_PAD:
2625 /* No need to write to this slot */
2626 break;
2627 default:
2628 emit_generic_urb_slot(reg, varying);
2629 break;
2630 }
2631 }
2632
2633 static int
2634 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2635 {
2636 struct intel_context *intel = &brw->intel;
2637
2638 if (intel->gen >= 6) {
2639 /* URB data written (does not include the message header reg) must
2640 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2641 * section 5.4.3.2.2: URB_INTERLEAVED.
2642 *
2643 * URB entries are allocated on a multiple of 1024 bits, so an
2644 * extra 128 bits written here to make the end align to 256 is
2645 * no problem.
2646 */
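      /* mlen includes the one header register, so keeping mlen odd keeps
       * the written data length even.
       */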
2647 if ((mlen % 2) != 1)
2648 mlen++;
2649 }
2650
2651 return mlen;
2652 }
2653
2654 void
2655 vec4_vs_visitor::emit_urb_write_header(int mrf)
2656 {
2657 /* No need to do anything for VS; an implied write to this MRF will be
2658 * performed by VS_OPCODE_URB_WRITE.
2659 */
2660 (void) mrf;
2661 }
2662
2663 vec4_instruction *
2664 vec4_vs_visitor::emit_urb_write_opcode(bool complete)
2665 {
2666 /* For VS, the URB writes end the thread. */
2667 if (complete) {
2668 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2669 emit_shader_time_end();
2670 }
2671
2672 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2673 inst->eot = complete;
2674
2675 return inst;
2676 }
2677
2678 /**
2679 * Generates the VUE payload plus the necessary URB write instructions to
2680 * output it.
2681 *
2682 * The VUE layout is documented in Volume 2a.
2683 */
2684 void
2685 vec4_visitor::emit_vertex()
2686 {
2687 /* MRF 0 is reserved for the debugger, so start with message header
2688 * in MRF 1.
2689 */
2690 int base_mrf = 1;
2691 int mrf = base_mrf;
2692 /* In the process of generating our URB write message contents, we
2693 * may need to unspill a register or load from an array. Those
2694 * reads would use MRFs 14-15.
2695 */
2696 int max_usable_mrf = 13;
2697
2698 /* The following assertion verifies that max_usable_mrf causes an
2699 * even-numbered amount of URB write data, which will meet gen6's
2700 * requirements for length alignment.
2701 */
2702 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2703
2704 /* First mrf is the g0-based message header containing URB handles and
2705 * such.
2706 */
2707 emit_urb_write_header(mrf++);
2708
2709 if (intel->gen < 6) {
2710 emit_ndc_computation();
2711 }
2712
2713 /* Set up the VUE data for the first URB write */
2714 int slot;
2715 for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
2716 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2717
2718 /* If this was max_usable_mrf, we can't fit anything more into this URB
2719 * WRITE.
2720 */
2721 if (mrf > max_usable_mrf) {
2722 slot++;
2723 break;
2724 }
2725 }
2726
2727 bool complete = slot >= prog_data->vue_map.num_slots;
2728 current_annotation = "URB write";
2729 vec4_instruction *inst = emit_urb_write_opcode(complete);
2730 inst->base_mrf = base_mrf;
2731 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2732
2733 /* Optional second URB write */
2734 if (!complete) {
2735 mrf = base_mrf + 1;
2736
2737 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2738 assert(mrf < max_usable_mrf);
2739
2740 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2741 }
2742
2743 current_annotation = "URB write";
2744 inst = emit_urb_write_opcode(true /* complete */);
2745 inst->base_mrf = base_mrf;
2746 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2747 /* URB destination offset. In the previous write, we used MRFs 2-13
2748 * for payload data (MRF 1 held the header), so 12 regs. URB offset is
2749 * in URB row increments, and each of our MRFs is half of one of
2750 * those, since we're doing interleaved writes.
2751 */
2752 inst->offset = (max_usable_mrf - base_mrf) / 2;
2753 }
2754 }
2755
2756 void
2757 vec4_vs_visitor::emit_thread_end()
2758 {
2759 /* For VS, we always end the thread by emitting a single vertex.
2760 * emit_urb_write_opcode() will take care of setting the eot flag on the
2761 * SEND instruction.
2762 */
2763 emit_vertex();
2764 }
2765
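/* A brief description added here, based on what the function below does:
 * it computes the offset to put in the scratch read/write message for an
 * access at @reg_offset, emitting the address math before @inst when the
 * access is relatively addressed.
 */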
2766 src_reg
2767 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2768 src_reg *reladdr, int reg_offset)
2769 {
2770 /* Because we store the values to scratch interleaved like our
2771 * vertex data, we need to scale the vec4 index by 2.
2772 */
2773 int message_header_scale = 2;
2774
2775 /* Pre-gen6, the message header uses byte offsets instead of vec4
2776 * (16-byte) offset units.
2777 */
2778 if (intel->gen < 6)
2779 message_header_scale *= 16;
2780
2781 if (reladdr) {
2782 src_reg index = src_reg(this, glsl_type::int_type);
2783
2784 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2785 emit_before(inst, MUL(dst_reg(index),
2786 index, src_reg(message_header_scale)));
2787
2788 return index;
2789 } else {
2790 return src_reg(reg_offset * message_header_scale);
2791 }
2792 }
2793
2794 src_reg
2795 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2796 src_reg *reladdr, int reg_offset)
2797 {
2798 if (reladdr) {
2799 src_reg index = src_reg(this, glsl_type::int_type);
2800
2801 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2802
2803 /* Pre-gen6, the message header uses byte offsets instead of vec4
2804 * (16-byte) offset units.
2805 */
2806 if (intel->gen < 6) {
2807 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2808 }
2809
2810 return index;
2811 } else {
2812 int message_header_scale = intel->gen < 6 ? 16 : 1;
2813 return src_reg(reg_offset * message_header_scale);
2814 }
2815 }
2816
2817 /**
2818 * Emits an instruction before @inst to load the value named by @orig_src
2819 * from scratch space at @base_offset to @temp.
2820 *
2821 * @base_offset is measured in 32-byte units (the size of a register).
2822 */
2823 void
2824 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2825 dst_reg temp, src_reg orig_src,
2826 int base_offset)
2827 {
2828 int reg_offset = base_offset + orig_src.reg_offset;
2829 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2830
2831 emit_before(inst, SCRATCH_READ(temp, index));
2832 }
2833
2834 /**
2835 * Emits an instruction after @inst to store the value to be written
2836 * to @orig_dst to scratch space at @base_offset, from @temp.
2837 *
2838 * @base_offset is measured in 32-byte units (the size of a register).
2839 */
2840 void
2841 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2842 {
2843 int reg_offset = base_offset + inst->dst.reg_offset;
2844 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2845
2846 /* Create a temporary register to store *inst's result in.
2847 *
2848 * We have to be careful in MOVing from our temporary result register in
2849 * the scratch write. If we swizzle from channels of the temporary that
2850 * weren't initialized, it will confuse live interval analysis, which will
2851 * make spilling fail to make progress.
2852 */
2853 src_reg temp = src_reg(this, glsl_type::vec4_type);
2854 temp.type = inst->dst.type;
2855 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2856 int swizzles[4];
2857 for (int i = 0; i < 4; i++)
2858 if (inst->dst.writemask & (1 << i))
2859 swizzles[i] = i;
2860 else
2861 swizzles[i] = first_writemask_chan;
2862 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2863 swizzles[2], swizzles[3]);
2864
2865 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2866 inst->dst.writemask));
2867 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2868 write->predicate = inst->predicate;
2869 write->ir = inst->ir;
2870 write->annotation = inst->annotation;
2871 inst->insert_after(write);
2872
2873 inst->dst.file = temp.file;
2874 inst->dst.reg = temp.reg;
2875 inst->dst.reg_offset = temp.reg_offset;
2876 inst->dst.reladdr = NULL;
2877 }
2878
2879 /**
2880 * We can't generally support array access in GRF space, because a
2881 * single instruction's destination can only span 2 contiguous
2882 * registers. So, we send all GRF arrays that get variable index
2883 * access to scratch space.
2884 */
2885 void
2886 vec4_visitor::move_grf_array_access_to_scratch()
2887 {
2888 int scratch_loc[this->virtual_grf_count];
2889
2890 for (int i = 0; i < this->virtual_grf_count; i++) {
2891 scratch_loc[i] = -1;
2892 }
2893
2894 /* First, calculate the set of virtual GRFs that need to be punted
2895 * to scratch due to having any array access on them, and where in
2896 * scratch.
2897 */
2898 foreach_list(node, &this->instructions) {
2899 vec4_instruction *inst = (vec4_instruction *)node;
2900
2901 if (inst->dst.file == GRF && inst->dst.reladdr &&
2902 scratch_loc[inst->dst.reg] == -1) {
2903 scratch_loc[inst->dst.reg] = c->last_scratch;
2904 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2905 }
2906
2907 for (int i = 0 ; i < 3; i++) {
2908 src_reg *src = &inst->src[i];
2909
2910 if (src->file == GRF && src->reladdr &&
2911 scratch_loc[src->reg] == -1) {
2912 scratch_loc[src->reg] = c->last_scratch;
2913 c->last_scratch += this->virtual_grf_sizes[src->reg];
2914 }
2915 }
2916 }
2917
2918 /* Now, for anything that will be accessed through scratch, rewrite
2919 * it to load/store. Note that this is a _safe list walk, because
2920 * we may generate a new scratch_write instruction after the one
2921 * we're processing.
2922 */
2923 foreach_list_safe(node, &this->instructions) {
2924 vec4_instruction *inst = (vec4_instruction *)node;
2925
2926 /* Set up the annotation tracking for newly generated instructions. */
2927 base_ir = inst->ir;
2928 current_annotation = inst->annotation;
2929
2930 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2931 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2932 }
2933
2934 for (int i = 0 ; i < 3; i++) {
2935 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2936 continue;
2937
2938 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2939
2940 emit_scratch_read(inst, temp, inst->src[i],
2941 scratch_loc[inst->src[i].reg]);
2942
2943 inst->src[i].file = temp.file;
2944 inst->src[i].reg = temp.reg;
2945 inst->src[i].reg_offset = temp.reg_offset;
2946 inst->src[i].reladdr = NULL;
2947 }
2948 }
2949 }
2950
2951 /**
2952 * Emits an instruction before @inst to load the value named by @orig_src
2953 * from the pull constant buffer (surface) at @base_offset to @temp.
2954 */
2955 void
2956 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2957 dst_reg temp, src_reg orig_src,
2958 int base_offset)
2959 {
2960 int reg_offset = base_offset + orig_src.reg_offset;
2961 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2962 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2963 vec4_instruction *load;
2964
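   /* Gen7+ can take the offset straight from a GRF; earlier generations
    * pass it through an MRF-based message (base_mrf 14).
    */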
2965 if (intel->gen >= 7) {
2966 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
2967 grf_offset.type = offset.type;
2968 emit_before(inst, MOV(grf_offset, offset));
2969
2970 load = new(mem_ctx) vec4_instruction(this,
2971 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
2972 temp, index, src_reg(grf_offset));
2973 } else {
2974 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2975 temp, index, offset);
2976 load->base_mrf = 14;
2977 load->mlen = 1;
2978 }
2979 emit_before(inst, load);
2980 }
2981
2982 /**
2983 * Implements array access of uniforms by inserting a
2984 * PULL_CONSTANT_LOAD instruction.
2985 *
2986 * Unlike temporary GRF array access (where we don't support it due to
2987 * the difficulty of doing relative addressing on instruction
2988 * destinations), we could potentially do array access of uniforms
2989 * that were loaded in GRF space as push constants. In real-world
2990 * usage we've seen, though, the arrays being used are always larger
2991 * than we could load as push constants, so just always move all
2992 * uniform array access out to a pull constant buffer.
2993 */
2994 void
2995 vec4_visitor::move_uniform_array_access_to_pull_constants()
2996 {
2997 int pull_constant_loc[this->uniforms];
2998
2999 for (int i = 0; i < this->uniforms; i++) {
3000 pull_constant_loc[i] = -1;
3001 }
3002
3003 /* Walk through and find array access of uniforms. Put a copy of that
3004 * uniform in the pull constant buffer.
3005 *
3006 * Note that we don't move constant-indexed accesses to arrays. No
3007 * testing has been done of the performance impact of this choice.
3008 */
3009 foreach_list_safe(node, &this->instructions) {
3010 vec4_instruction *inst = (vec4_instruction *)node;
3011
3012 for (int i = 0 ; i < 3; i++) {
3013 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3014 continue;
3015
3016 int uniform = inst->src[i].reg;
3017
3018 /* If this array isn't already present in the pull constant buffer,
3019 * add it.
3020 */
3021 if (pull_constant_loc[uniform] == -1) {
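	    /* param/pull_param are tracked per float component, so the vec4
	     * slot index for this uniform is nr_pull_params / 4.
	     */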
3022 const float **values = &prog_data->param[uniform * 4];
3023
3024 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
3025
3026 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3027 prog_data->pull_param[prog_data->nr_pull_params++]
3028 = values[j];
3029 }
3030 }
3031
3032 /* Set up the annotation tracking for newly generated instructions. */
3033 base_ir = inst->ir;
3034 current_annotation = inst->annotation;
3035
3036 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3037
3038 emit_pull_constant_load(inst, temp, inst->src[i],
3039 pull_constant_loc[uniform]);
3040
3041 inst->src[i].file = temp.file;
3042 inst->src[i].reg = temp.reg;
3043 inst->src[i].reg_offset = temp.reg_offset;
3044 inst->src[i].reladdr = NULL;
3045 }
3046 }
3047
3048 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3049 * no need to track them as larger-than-vec4 objects. This will be
3050 * relied on in cutting out unused uniform vectors from push
3051 * constants.
3052 */
3053 split_uniform_registers();
3054 }
3055
3056 void
3057 vec4_visitor::resolve_ud_negate(src_reg *reg)
3058 {
3059 if (reg->type != BRW_REGISTER_TYPE_UD ||
3060 !reg->negate)
3061 return;
3062
3063 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3064 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3065 *reg = temp;
3066 }
3067
3068 vec4_visitor::vec4_visitor(struct brw_context *brw,
3069 struct brw_vec4_compile *c,
3070 struct gl_program *prog,
3071 const struct brw_vec4_prog_key *key,
3072 struct brw_vec4_prog_data *prog_data,
3073 struct gl_shader_program *shader_prog,
3074 struct brw_shader *shader,
3075 void *mem_ctx,
3076 bool debug_flag)
3077 : debug_flag(debug_flag)
3078 {
3079 this->brw = brw;
3080 this->intel = &brw->intel;
3081 this->ctx = &intel->ctx;
3082 this->shader_prog = shader_prog;
3083 this->shader = shader;
3084
3085 this->mem_ctx = mem_ctx;
3086 this->failed = false;
3087
3088 this->base_ir = NULL;
3089 this->current_annotation = NULL;
3090 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3091
3092 this->c = c;
3093 this->prog = prog;
3094 this->key = key;
3095 this->prog_data = prog_data;
3096
3097 this->variable_ht = hash_table_ctor(0,
3098 hash_table_pointer_hash,
3099 hash_table_pointer_compare);
3100
3101 this->virtual_grf_def = NULL;
3102 this->virtual_grf_use = NULL;
3103 this->virtual_grf_sizes = NULL;
3104 this->virtual_grf_count = 0;
3105 this->virtual_grf_reg_map = NULL;
3106 this->virtual_grf_reg_count = 0;
3107 this->virtual_grf_array_size = 0;
3108 this->live_intervals_valid = false;
3109
3110 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3111
3112 this->uniforms = 0;
3113 }
3114
3115 vec4_visitor::~vec4_visitor()
3116 {
3117 hash_table_dtor(this->variable_ht);
3118 }
3119
3120
3121 vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
3122 struct brw_vs_compile *vs_compile,
3123 struct brw_vs_prog_data *vs_prog_data,
3124 struct gl_shader_program *prog,
3125 struct brw_shader *shader,
3126 void *mem_ctx)
3127 : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base,
3128 &vs_compile->key.base, &vs_prog_data->base, prog, shader,
3129 mem_ctx, INTEL_DEBUG & DEBUG_VS),
3130 vs_compile(vs_compile),
3131 vs_prog_data(vs_prog_data)
3132 {
3133 }
3134
3135
3136 void
3137 vec4_visitor::fail(const char *format, ...)
3138 {
3139 va_list va;
3140 char *msg;
3141
3142 if (failed)
3143 return;
3144
3145 failed = true;
3146
3147 va_start(va, format);
3148 msg = ralloc_vasprintf(mem_ctx, format, va);
3149 va_end(va);
3150 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3151
3152 this->fail_msg = msg;
3153
3154 if (debug_flag) {
3155 fprintf(stderr, "%s", msg);
3156 }
3157 }
3158
3159 } /* namespace brw */