i965/vs/gen7: Emit code for GLSL ES 3.00 pack/unpack operations (v3)
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 ALU1(NOT)
111 ALU1(MOV)
112 ALU1(FRC)
113 ALU1(RNDD)
114 ALU1(RNDE)
115 ALU1(RNDZ)
116 ALU1(F32TO16)
117 ALU1(F16TO32)
118 ALU2(ADD)
119 ALU2(MUL)
120 ALU2(MACH)
121 ALU2(AND)
122 ALU2(OR)
123 ALU2(XOR)
124 ALU2(DP3)
125 ALU2(DP4)
126 ALU2(DPH)
127 ALU2(SHL)
128 ALU2(SHR)
129 ALU2(ASR)
130
131 /** Gen4 predicated IF. */
132 vec4_instruction *
133 vec4_visitor::IF(uint32_t predicate)
134 {
135 vec4_instruction *inst;
136
137 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
138 inst->predicate = predicate;
139
140 return inst;
141 }
142
143 /** Gen6+ IF with embedded comparison. */
144 vec4_instruction *
145 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
146 {
147 assert(intel->gen >= 6);
148
149 vec4_instruction *inst;
150
151 resolve_ud_negate(&src0);
152 resolve_ud_negate(&src1);
153
154 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
155 src0, src1);
156 inst->conditional_mod = condition;
157
158 return inst;
159 }
160
161 /**
162 * CMP: Sets the low bit of the destination channels with the result
163 * of the comparison, while the upper bits are undefined, and updates
164 * the flag register with the packed 16 bits of the result.
165 */
166 vec4_instruction *
167 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
168 {
169 vec4_instruction *inst;
170
171 /* original gen4 does type conversion to the destination type
172 * before comparison, producing garbage results for floating
173 * point comparisons.
174 */
175 if (intel->gen == 4) {
176 dst.type = src0.type;
177 if (dst.file == HW_REG)
178 dst.fixed_hw_reg.type = dst.type;
179 }
180
181 resolve_ud_negate(&src0);
182 resolve_ud_negate(&src1);
183
184 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
185 inst->conditional_mod = condition;
186
187 return inst;
188 }
189
190 vec4_instruction *
191 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
196 dst, index);
197 inst->base_mrf = 14;
198 inst->mlen = 2;
199
200 return inst;
201 }
202
203 vec4_instruction *
204 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
205 {
206 vec4_instruction *inst;
207
208 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
209 dst, src, index);
210 inst->base_mrf = 13;
211 inst->mlen = 3;
212
213 return inst;
214 }
215
216 void
217 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
218 {
219 static enum opcode dot_opcodes[] = {
220 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
221 };
222
223 emit(dot_opcodes[elements - 2], dst, src0, src1);
224 }
225
226 src_reg
227 vec4_visitor::fix_math_operand(src_reg src)
228 {
229 /* The gen6 math instruction ignores the source modifiers --
230 * swizzle, abs, negate, and at least some parts of the register
231 * region description.
232 *
233 * Rather than trying to enumerate all these cases, *always* expand the
234 * operand to a temp GRF for gen6.
235 *
236 * For gen7, keep the operand as-is, except if immediate, which gen7 still
237 * can't use.
238 */
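/* E.g. (illustrative): on gen7 a POW with an immediate 2.0f exponent gets
 * that immediate copied into a temporary GRF first; on gen6 every operand is
 * copied, since a negate or abs modifier would otherwise be silently ignored.
 */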
239
240 if (intel->gen == 7 && src.file != IMM)
241 return src;
242
243 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
244 expanded.type = src.type;
245 emit(MOV(expanded, src));
246 return src_reg(expanded);
247 }
248
249 void
250 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
251 {
252 src = fix_math_operand(src);
253
254 if (dst.writemask != WRITEMASK_XYZW) {
255 /* The gen6 math instruction must be align1, so we can't do
256 * writemasks.
257 */
258 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
259
260 emit(opcode, temp_dst, src);
261
262 emit(MOV(dst, src_reg(temp_dst)));
263 } else {
264 emit(opcode, dst, src);
265 }
266 }
267
268 void
269 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
270 {
271 vec4_instruction *inst = emit(opcode, dst, src);
272 inst->base_mrf = 1;
273 inst->mlen = 1;
274 }
275
276 void
277 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
278 {
279 switch (opcode) {
280 case SHADER_OPCODE_RCP:
281 case SHADER_OPCODE_RSQ:
282 case SHADER_OPCODE_SQRT:
283 case SHADER_OPCODE_EXP2:
284 case SHADER_OPCODE_LOG2:
285 case SHADER_OPCODE_SIN:
286 case SHADER_OPCODE_COS:
287 break;
288 default:
289 assert(!"not reached: bad math opcode");
290 return;
291 }
292
293 if (intel->gen >= 6) {
294 return emit_math1_gen6(opcode, dst, src);
295 } else {
296 return emit_math1_gen4(opcode, dst, src);
297 }
298 }
299
300 void
301 vec4_visitor::emit_math2_gen6(enum opcode opcode,
302 dst_reg dst, src_reg src0, src_reg src1)
303 {
304 src0 = fix_math_operand(src0);
305 src1 = fix_math_operand(src1);
306
307 if (dst.writemask != WRITEMASK_XYZW) {
308 /* The gen6 math instruction must be align1, so we can't do
309 * writemasks.
310 */
311 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
312 temp_dst.type = dst.type;
313
314 emit(opcode, temp_dst, src0, src1);
315
316 emit(MOV(dst, src_reg(temp_dst)));
317 } else {
318 emit(opcode, dst, src0, src1);
319 }
320 }
321
322 void
323 vec4_visitor::emit_math2_gen4(enum opcode opcode,
324 dst_reg dst, src_reg src0, src_reg src1)
325 {
326 vec4_instruction *inst = emit(opcode, dst, src0, src1);
327 inst->base_mrf = 1;
328 inst->mlen = 2;
329 }
330
331 void
332 vec4_visitor::emit_math(enum opcode opcode,
333 dst_reg dst, src_reg src0, src_reg src1)
334 {
335 switch (opcode) {
336 case SHADER_OPCODE_POW:
337 case SHADER_OPCODE_INT_QUOTIENT:
338 case SHADER_OPCODE_INT_REMAINDER:
339 break;
340 default:
341 assert(!"not reached: unsupported binary math opcode");
342 return;
343 }
344
345 if (intel->gen >= 6) {
346 return emit_math2_gen6(opcode, dst, src0, src1);
347 } else {
348 return emit_math2_gen4(opcode, dst, src0, src1);
349 }
350 }
351
352 void
353 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
354 {
355 if (intel->gen < 7)
356 assert(!"ir_unop_pack_half_2x16 should be lowered");
357
358 assert(dst.type == BRW_REGISTER_TYPE_UD);
359 assert(src0.type == BRW_REGISTER_TYPE_F);
360
361 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
362 *
363 * Because this instruction does not have a 16-bit floating-point type,
364 * the destination data type must be Word (W).
365 *
366 * The destination must be DWord-aligned and specify a horizontal stride
367 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
368 * each destination channel and the upper word is not modified.
369 *
370 * The above restriction implies that the f32to16 instruction must use
371 * align1 mode, because only in align1 mode is it possible to specify
372 * horizontal stride. We choose here to defy the hardware docs and emit
373 * align16 instructions.
374 *
375 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
376 * instructions. I was partially successful in that the code passed all
377 * tests. However, the code was dubiously correct and fragile, and the
378 * tests were not harsh enough to probe that frailty. Not trusting the
379 * code, I chose instead to remain in align16 mode in defiance of the hw
380 * docs).
381 *
382 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
383 * simulator, emitting a f32to16 in align16 mode with UD as destination
384 * data type is safe. The behavior differs from that specified in the PRM
385 * in that the upper word of each destination channel is cleared to 0.
386 */
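/* Illustrative example (not from the original source): packHalf2x16 places
 * the half-float encoding of .x in the low word and of .y in the high word.
 * E.g. vec2(1.0, -2.0) encodes as 0x3C00 and 0xC000 respectively, so the
 * packed result built below is 0xC0003C00.
 */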
387
388 dst_reg tmp_dst(this, glsl_type::uvec2_type);
389 src_reg tmp_src(tmp_dst);
390
391 #if 0
392 /* Verify the undocumented behavior on which the following instructions
393 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
394 * then the result of the bit-or instruction below will be incorrect.
395 *
396 * You should inspect the disasm output in order to verify that the MOV is
397 * not optimized away.
398 */
399 emit(MOV(tmp_dst, src_reg(0x12345678u)));
400 #endif
401
402 /* Give tmp the form below, where "." means untouched.
403 *
404 *  w z     y          x      w z     y          x
405 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
406 *
407 * That the upper word of each write-channel be 0 is required for the
408 * following bit-shift and bit-or instructions to work. Note that this
409 * relies on the undocumented hardware behavior mentioned above.
410 */
411 tmp_dst.writemask = WRITEMASK_XY;
412 emit(F32TO16(tmp_dst, src0));
413
414 /* Give the write-channels of dst the form:
415 * 0xhhhh0000
416 */
417 tmp_src.swizzle = SWIZZLE_Y;
418 emit(SHL(dst, tmp_src, src_reg(16u)));
419
420 /* Finally, give the write-channels of dst the form of packHalf2x16's
421 * output:
422 * 0xhhhhllll
423 */
424 tmp_src.swizzle = SWIZZLE_X;
425 emit(OR(dst, src_reg(dst), tmp_src));
426 }
427
428 void
429 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
430 {
431 if (intel->gen < 7)
432 assert(!"ir_unop_unpack_half_2x16 should be lowered");
433
434 assert(dst.type == BRW_REGISTER_TYPE_F);
435 assert(src0.type == BRW_REGISTER_TYPE_UD);
436
437 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
438 *
439 * Because this instruction does not have a 16-bit floating-point type,
440 * the source data type must be Word (W). The destination type must be
441 * F (Float).
442 *
443 * To use W as the source data type, we must adjust horizontal strides,
444 * which is only possible in align1 mode. All my [chadv] attempts at
445 * emitting align1 instructions for unpackHalf2x16 failed to pass the
446 * Piglit tests, so I gave up.
447 *
448 * I've verified that, on gen7 hardware and the simulator, it is safe to
449 * emit f16to32 in align16 mode with UD as source data type.
450 */
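/* Illustrative example: unpackHalf2x16 inverts the packing above, so
 * unpackHalf2x16(0xC0003C00u) yields vec2(1.0, -2.0). The AND below isolates
 * the low half (0x3C00 -> 1.0) and the SHR the high half (0xC000 -> -2.0)
 * before the F16TO32 conversion.
 */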
451
452 dst_reg tmp_dst(this, glsl_type::uvec2_type);
453 src_reg tmp_src(tmp_dst);
454
455 tmp_dst.writemask = WRITEMASK_X;
456 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
457
458 tmp_dst.writemask = WRITEMASK_Y;
459 emit(SHR(tmp_dst, src0, src_reg(16u)));
460
461 dst.writemask = WRITEMASK_XY;
462 emit(F16TO32(dst, tmp_src));
463 }
464
465 void
466 vec4_visitor::visit_instructions(const exec_list *list)
467 {
468 foreach_list(node, list) {
469 ir_instruction *ir = (ir_instruction *)node;
470
471 base_ir = ir;
472 ir->accept(this);
473 }
474 }
475
476
477 static int
478 type_size(const struct glsl_type *type)
479 {
480 unsigned int i;
481 int size;
482
483 switch (type->base_type) {
484 case GLSL_TYPE_UINT:
485 case GLSL_TYPE_INT:
486 case GLSL_TYPE_FLOAT:
487 case GLSL_TYPE_BOOL:
488 if (type->is_matrix()) {
489 return type->matrix_columns;
490 } else {
491 /* Regardless of the size of the vector, it gets a vec4. This is bad
492 * packing for things like floats, but otherwise arrays become a
493 * mess. Hopefully a later pass over the code can pack scalars
494 * down if appropriate.
495 */
496 return 1;
497 }
498 case GLSL_TYPE_ARRAY:
499 assert(type->length > 0);
500 return type_size(type->fields.array) * type->length;
501 case GLSL_TYPE_STRUCT:
502 size = 0;
503 for (i = 0; i < type->length; i++) {
504 size += type_size(type->fields.structure[i].type);
505 }
506 return size;
507 case GLSL_TYPE_SAMPLER:
508 /* Samplers take up one slot in UNIFORMS[], but they're baked in
509 * at link time.
510 */
511 return 1;
512 default:
513 assert(0);
514 return 0;
515 }
516 }
517
518 int
519 vec4_visitor::virtual_grf_alloc(int size)
520 {
521 if (virtual_grf_array_size <= virtual_grf_count) {
522 if (virtual_grf_array_size == 0)
523 virtual_grf_array_size = 16;
524 else
525 virtual_grf_array_size *= 2;
526 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
527 virtual_grf_array_size);
528 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
529 virtual_grf_array_size);
530 }
531 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
532 virtual_grf_reg_count += size;
533 virtual_grf_sizes[virtual_grf_count] = size;
534 return virtual_grf_count++;
535 }
536
537 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
538 {
539 init();
540
541 this->file = GRF;
542 this->reg = v->virtual_grf_alloc(type_size(type));
543
544 if (type->is_array() || type->is_record()) {
545 this->swizzle = BRW_SWIZZLE_NOOP;
546 } else {
547 this->swizzle = swizzle_for_size(type->vector_elements);
548 }
549
550 this->type = brw_type_for_base_type(type);
551 }
552
553 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
554 {
555 init();
556
557 this->file = GRF;
558 this->reg = v->virtual_grf_alloc(type_size(type));
559
560 if (type->is_array() || type->is_record()) {
561 this->writemask = WRITEMASK_XYZW;
562 } else {
563 this->writemask = (1 << type->vector_elements) - 1;
564 }
565
566 this->type = brw_type_for_base_type(type);
567 }
568
569 /* Our support for uniforms is piggy-backed on the struct
570 * gl_fragment_program, because that's where the values actually
571 * get stored, rather than in some global gl_shader_program uniform
572 * store.
573 */
574 void
575 vec4_visitor::setup_uniform_values(ir_variable *ir)
576 {
577 int namelen = strlen(ir->name);
578
579 /* The data for our (non-builtin) uniforms is stored in a series of
580 * gl_uniform_driver_storage structs for each subcomponent that
581 * glGetUniformLocation() could name. We know it's been set up in the same
582 * order we'd walk the type, so walk the list of storage and find anything
583 * with our name, or the prefix of a component that starts with our name.
584 */
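/* Illustrative example with a hypothetical uniform: for "uniform mat3 m[2]"
 * there is one storage entry named "m" with array_elements == 2 and
 * matrix_columns == 3, so the loop below uploads 6 vec3 columns and pads the
 * unused fourth component of each with zero.
 */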
585 for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
586 struct gl_uniform_storage *storage = &prog->UniformStorage[u];
587
588 if (strncmp(ir->name, storage->name, namelen) != 0 ||
589 (storage->name[namelen] != 0 &&
590 storage->name[namelen] != '.' &&
591 storage->name[namelen] != '[')) {
592 continue;
593 }
594
595 gl_constant_value *components = storage->storage;
596 unsigned vector_count = (MAX2(storage->array_elements, 1) *
597 storage->type->matrix_columns);
598
599 for (unsigned s = 0; s < vector_count; s++) {
600 uniform_vector_size[uniforms] = storage->type->vector_elements;
601
602 int i;
603 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
604 c->prog_data.param[uniforms * 4 + i] = &components->f;
605 components++;
606 }
607 for (; i < 4; i++) {
608 static float zero = 0;
609 c->prog_data.param[uniforms * 4 + i] = &zero;
610 }
611
612 uniforms++;
613 }
614 }
615 }
616
617 void
618 vec4_visitor::setup_uniform_clipplane_values()
619 {
620 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
621
622 if (intel->gen < 6) {
623 /* Pre-Gen6, we compact clip planes. For example, if the user
624 * enables just clip planes 0, 1, and 3, we will enable clip planes
625 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
626 * plane 2. This simplifies the implementation of the clip
627 * thread.
628 */
629 int compacted_clipplane_index = 0;
630 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
631 if (!(c->key.userclip_planes_enabled_gen_4_5 & (1 << i)))
632 continue;
633
634 this->uniform_vector_size[this->uniforms] = 4;
635 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
636 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
637 for (int j = 0; j < 4; ++j) {
638 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
639 }
640 ++compacted_clipplane_index;
641 ++this->uniforms;
642 }
643 } else {
644 /* In Gen6 and later, we don't compact clip planes, because this
645 * simplifies the implementation of gl_ClipDistance.
646 */
647 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
648 this->uniform_vector_size[this->uniforms] = 4;
649 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
650 this->userplane[i].type = BRW_REGISTER_TYPE_F;
651 for (int j = 0; j < 4; ++j) {
652 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
653 }
654 ++this->uniforms;
655 }
656 }
657 }
658
659 /* Our support for builtin uniforms is even scarier than non-builtin.
660 * It sits on top of the PROG_STATE_VAR parameters that are
661 * automatically updated from GL context state.
662 */
663 void
664 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
665 {
666 const ir_state_slot *const slots = ir->state_slots;
667 assert(ir->state_slots != NULL);
668
669 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
670 /* This state reference has already been setup by ir_to_mesa,
671 * but we'll get the same index back here. We can reference
672 * ParameterValues directly, since unlike brw_fs.cpp, we never
673 * add new state references during compile.
674 */
675 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
676 (gl_state_index *)slots[i].tokens);
677 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
678
679 this->uniform_vector_size[this->uniforms] = 0;
680 /* Add each of the unique swizzled channels of the element.
681 * This will end up matching the size of the glsl_type of this field.
682 */
683 int last_swiz = -1;
684 for (unsigned int j = 0; j < 4; j++) {
685 int swiz = GET_SWZ(slots[i].swizzle, j);
686 last_swiz = swiz;
687
688 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
689 if (swiz <= last_swiz)
690 this->uniform_vector_size[this->uniforms]++;
691 }
692 this->uniforms++;
693 }
694 }
695
696 dst_reg *
697 vec4_visitor::variable_storage(ir_variable *var)
698 {
699 return (dst_reg *)hash_table_find(this->variable_ht, var);
700 }
701
702 void
703 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
704 {
705 ir_expression *expr = ir->as_expression();
706
707 *predicate = BRW_PREDICATE_NORMAL;
708
709 if (expr) {
710 src_reg op[2];
711 vec4_instruction *inst;
712
713 assert(expr->get_num_operands() <= 2);
714 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
715 expr->operands[i]->accept(this);
716 op[i] = this->result;
717
718 resolve_ud_negate(&op[i]);
719 }
720
721 switch (expr->operation) {
722 case ir_unop_logic_not:
723 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
724 inst->conditional_mod = BRW_CONDITIONAL_Z;
725 break;
726
727 case ir_binop_logic_xor:
728 inst = emit(XOR(dst_null_d(), op[0], op[1]));
729 inst->conditional_mod = BRW_CONDITIONAL_NZ;
730 break;
731
732 case ir_binop_logic_or:
733 inst = emit(OR(dst_null_d(), op[0], op[1]));
734 inst->conditional_mod = BRW_CONDITIONAL_NZ;
735 break;
736
737 case ir_binop_logic_and:
738 inst = emit(AND(dst_null_d(), op[0], op[1]));
739 inst->conditional_mod = BRW_CONDITIONAL_NZ;
740 break;
741
742 case ir_unop_f2b:
743 if (intel->gen >= 6) {
744 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
745 } else {
746 inst = emit(MOV(dst_null_f(), op[0]));
747 inst->conditional_mod = BRW_CONDITIONAL_NZ;
748 }
749 break;
750
751 case ir_unop_i2b:
752 if (intel->gen >= 6) {
753 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
754 } else {
755 inst = emit(MOV(dst_null_d(), op[0]));
756 inst->conditional_mod = BRW_CONDITIONAL_NZ;
757 }
758 break;
759
760 case ir_binop_all_equal:
761 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
762 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
763 break;
764
765 case ir_binop_any_nequal:
766 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
767 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
768 break;
769
770 case ir_unop_any:
771 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
772 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
773 break;
774
775 case ir_binop_greater:
776 case ir_binop_gequal:
777 case ir_binop_less:
778 case ir_binop_lequal:
779 case ir_binop_equal:
780 case ir_binop_nequal:
781 emit(CMP(dst_null_d(), op[0], op[1],
782 brw_conditional_for_comparison(expr->operation)));
783 break;
784
785 default:
786 assert(!"not reached");
787 break;
788 }
789 return;
790 }
791
792 ir->accept(this);
793
794 resolve_ud_negate(&this->result);
795
796 if (intel->gen >= 6) {
797 vec4_instruction *inst = emit(AND(dst_null_d(),
798 this->result, src_reg(1)));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 } else {
801 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
802 inst->conditional_mod = BRW_CONDITIONAL_NZ;
803 }
804 }
805
806 /**
807 * Emit a gen6 IF statement with the comparison folded into the IF
808 * instruction.
809 */
810 void
811 vec4_visitor::emit_if_gen6(ir_if *ir)
812 {
813 ir_expression *expr = ir->condition->as_expression();
814
815 if (expr) {
816 src_reg op[2];
817 dst_reg temp;
818
819 assert(expr->get_num_operands() <= 2);
820 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
821 expr->operands[i]->accept(this);
822 op[i] = this->result;
823 }
824
825 switch (expr->operation) {
826 case ir_unop_logic_not:
827 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
828 return;
829
830 case ir_binop_logic_xor:
831 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
832 return;
833
834 case ir_binop_logic_or:
835 temp = dst_reg(this, glsl_type::bool_type);
836 emit(OR(temp, op[0], op[1]));
837 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
838 return;
839
840 case ir_binop_logic_and:
841 temp = dst_reg(this, glsl_type::bool_type);
842 emit(AND(temp, op[0], op[1]));
843 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
844 return;
845
846 case ir_unop_f2b:
847 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
848 return;
849
850 case ir_unop_i2b:
851 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
852 return;
853
854 case ir_binop_greater:
855 case ir_binop_gequal:
856 case ir_binop_less:
857 case ir_binop_lequal:
858 case ir_binop_equal:
859 case ir_binop_nequal:
860 emit(IF(op[0], op[1],
861 brw_conditional_for_comparison(expr->operation)));
862 return;
863
864 case ir_binop_all_equal:
865 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
866 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
867 return;
868
869 case ir_binop_any_nequal:
870 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
871 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
872 return;
873
874 case ir_unop_any:
875 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
876 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
877 return;
878
879 default:
880 assert(!"not reached");
881 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
882 return;
883 }
884 return;
885 }
886
887 ir->condition->accept(this);
888
889 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
890 }
891
892 static dst_reg
893 with_writemask(dst_reg const & r, int mask)
894 {
895 dst_reg result = r;
896 result.writemask = mask;
897 return result;
898 }
899
900 void
901 vec4_visitor::emit_attribute_fixups()
902 {
903 dst_reg sign_recovery_shift;
904 dst_reg normalize_factor;
905 dst_reg es3_normalize_factor;
906
907 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
908 if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
909 uint8_t wa_flags = c->key.gl_attrib_wa_flags[i];
910 dst_reg reg(ATTR, i);
911 dst_reg reg_d = reg;
912 reg_d.type = BRW_REGISTER_TYPE_D;
913 dst_reg reg_ud = reg;
914 reg_ud.type = BRW_REGISTER_TYPE_UD;
915
916 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
917 * come in as floating point conversions of the integer values.
918 */
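/* Illustrative example: GL_FIXED is a signed 16.16 fixed-point format, so
 * the raw value 0x00018000 (98304) represents 1.5. The VF hands us 98304.0f,
 * and the multiply by 1/65536 below recovers 1.5f.
 */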
919 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
920 dst_reg dst = reg;
921 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
922 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
923 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
924 }
925
926 /* Do sign recovery for 2101010 formats if required. */
927 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
928 if (sign_recovery_shift.file == BAD_FILE) {
929 /* shift constant: <22,22,22,30> */
930 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
931 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
932 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
933 }
934
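/* Illustrative note: each 10-bit field sits in bits [9:0] of its channel, so
 * shifting left by 32-10 = 22 puts its sign bit in bit 31, and the arithmetic
 * shift right by 22 then sign-extends it. The 2-bit W component uses
 * 32-2 = 30 for the same reason.
 */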
935 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
936 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
937 }
938
939 /* Apply BGRA swizzle if required. */
940 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
941 src_reg temp = src_reg(reg);
942 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
943 emit(MOV(reg, temp));
944 }
945
946 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
947 /* ES 3.0 has different rules for converting signed normalized
948 * fixed-point numbers than desktop GL.
949 */
950 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
951 /* According to equation 2.2 of the ES 3.0 specification,
952 * signed normalization conversion is done by:
953 *
954 * f = c / (2^(b-1)-1)
955 */
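/* Worked numbers (illustrative): for the 10-bit XYZ components the divisor
 * is 2^9 - 1 = 511, so c = 511 maps to 1.0 and c = -511 to -1.0; the 2-bit W
 * component divides by 2^1 - 1 = 1. The emit_minmax() below clamps the one
 * out-of-range value, c = -512, up to -1.0.
 */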
956 if (es3_normalize_factor.file == BAD_FILE) {
957 /* mul constant: 1 / (2^(b-1) - 1) */
958 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
959 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
960 src_reg(1.0f / ((1<<9) - 1))));
961 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
962 src_reg(1.0f / ((1<<1) - 1))));
963 }
964
965 dst_reg dst = reg;
966 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
967 emit(MOV(dst, src_reg(reg_d)));
968 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
969 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
970 } else {
971 /* The following equations are from the OpenGL 3.2 specification:
972 *
973 * 2.1 unsigned normalization
974 * f = c/(2^n-1)
975 *
976 * 2.2 signed normalization
977 * f = (2c+1)/(2^n-1)
978 *
979 * Both of these share a common divisor, which is represented by
980 * "normalize_factor" in the code below.
981 */
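/* Worked numbers (illustrative): here n = 10 for XYZ and n = 2 for W, so the
 * divisors are 1023 and 3. Unsigned c = 1023 maps to 1.0; for the signed
 * case the 2c+1 numerator built below maps c = 511 to 1023/1023 = 1.0 and
 * c = -512 to -1023/1023 = -1.0.
 */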
982 if (normalize_factor.file == BAD_FILE) {
983 /* 1 / (2^b - 1) for b=<10,10,10,2> */
984 normalize_factor = dst_reg(this, glsl_type::vec4_type);
985 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
986 src_reg(1.0f / ((1<<10) - 1))));
987 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
988 src_reg(1.0f / ((1<<2) - 1))));
989 }
990
991 dst_reg dst = reg;
992 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
993 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
994
995 /* For signed normalization, we want the numerator to be 2c+1. */
996 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
997 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
998 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
999 }
1000
1001 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1002 }
1003 }
1004
1005 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1006 dst_reg dst = reg;
1007 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1008 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1009 }
1010 }
1011 }
1012 }
1013
1014 void
1015 vec4_visitor::visit(ir_variable *ir)
1016 {
1017 dst_reg *reg = NULL;
1018
1019 if (variable_storage(ir))
1020 return;
1021
1022 switch (ir->mode) {
1023 case ir_var_shader_in:
1024 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1025 break;
1026
1027 case ir_var_shader_out:
1028 reg = new(mem_ctx) dst_reg(this, ir->type);
1029
1030 for (int i = 0; i < type_size(ir->type); i++) {
1031 output_reg[ir->location + i] = *reg;
1032 output_reg[ir->location + i].reg_offset = i;
1033 output_reg[ir->location + i].type =
1034 brw_type_for_base_type(ir->type->get_scalar_type());
1035 output_reg_annotation[ir->location + i] = ir->name;
1036 }
1037 break;
1038
1039 case ir_var_auto:
1040 case ir_var_temporary:
1041 reg = new(mem_ctx) dst_reg(this, ir->type);
1042 break;
1043
1044 case ir_var_uniform:
1045 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1046
1047 /* Thanks to the lower_ubo_reference pass, we will see only
1048 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1049 * variables, so no need for them to be in variable_ht.
1050 */
1051 if (ir->uniform_block != -1)
1052 return;
1053
1054 /* Track how big the whole uniform variable is, in case we need to put a
1055 * copy of its data into pull constants for array access.
1056 */
1057 this->uniform_size[this->uniforms] = type_size(ir->type);
1058
1059 if (!strncmp(ir->name, "gl_", 3)) {
1060 setup_builtin_uniform_values(ir);
1061 } else {
1062 setup_uniform_values(ir);
1063 }
1064 break;
1065
1066 case ir_var_system_value:
1067 /* VertexID is stored by the VF as the last vertex element, but
1068 * we don't represent it with a flag in inputs_read, so we call
1069 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1070 */
1071 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1072 prog_data->uses_vertexid = true;
1073
1074 switch (ir->location) {
1075 case SYSTEM_VALUE_VERTEX_ID:
1076 reg->writemask = WRITEMASK_X;
1077 break;
1078 case SYSTEM_VALUE_INSTANCE_ID:
1079 reg->writemask = WRITEMASK_Y;
1080 break;
1081 default:
1082 assert(!"not reached");
1083 break;
1084 }
1085 break;
1086
1087 default:
1088 assert(!"not reached");
1089 }
1090
1091 reg->type = brw_type_for_base_type(ir->type);
1092 hash_table_insert(this->variable_ht, reg, ir);
1093 }
1094
1095 void
1096 vec4_visitor::visit(ir_loop *ir)
1097 {
1098 dst_reg counter;
1099
1100 /* We don't want debugging output to print the whole body of the
1101 * loop as the annotation.
1102 */
1103 this->base_ir = NULL;
1104
1105 if (ir->counter != NULL) {
1106 this->base_ir = ir->counter;
1107 ir->counter->accept(this);
1108 counter = *(variable_storage(ir->counter));
1109
1110 if (ir->from != NULL) {
1111 this->base_ir = ir->from;
1112 ir->from->accept(this);
1113
1114 emit(MOV(counter, this->result));
1115 }
1116 }
1117
1118 emit(BRW_OPCODE_DO);
1119
1120 if (ir->to) {
1121 this->base_ir = ir->to;
1122 ir->to->accept(this);
1123
1124 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1125 brw_conditional_for_comparison(ir->cmp)));
1126
1127 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1128 inst->predicate = BRW_PREDICATE_NORMAL;
1129 }
1130
1131 visit_instructions(&ir->body_instructions);
1132
1133
1134 if (ir->increment) {
1135 this->base_ir = ir->increment;
1136 ir->increment->accept(this);
1137 emit(ADD(counter, src_reg(counter), this->result));
1138 }
1139
1140 emit(BRW_OPCODE_WHILE);
1141 }
1142
1143 void
1144 vec4_visitor::visit(ir_loop_jump *ir)
1145 {
1146 switch (ir->mode) {
1147 case ir_loop_jump::jump_break:
1148 emit(BRW_OPCODE_BREAK);
1149 break;
1150 case ir_loop_jump::jump_continue:
1151 emit(BRW_OPCODE_CONTINUE);
1152 break;
1153 }
1154 }
1155
1156
1157 void
1158 vec4_visitor::visit(ir_function_signature *ir)
1159 {
1160 assert(0);
1161 (void)ir;
1162 }
1163
1164 void
1165 vec4_visitor::visit(ir_function *ir)
1166 {
1167 /* Ignore function bodies other than main() -- we shouldn't see calls to
1168 * them since they should all be inlined.
1169 */
1170 if (strcmp(ir->name, "main") == 0) {
1171 const ir_function_signature *sig;
1172 exec_list empty;
1173
1174 sig = ir->matching_signature(&empty);
1175
1176 assert(sig);
1177
1178 visit_instructions(&sig->body);
1179 }
1180 }
1181
1182 bool
1183 vec4_visitor::try_emit_sat(ir_expression *ir)
1184 {
1185 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1186 if (!sat_src)
1187 return false;
1188
1189 sat_src->accept(this);
1190 src_reg src = this->result;
1191
1192 this->result = src_reg(this, ir->type);
1193 vec4_instruction *inst;
1194 inst = emit(MOV(dst_reg(this->result), src));
1195 inst->saturate = true;
1196
1197 return true;
1198 }
1199
1200 void
1201 vec4_visitor::emit_bool_comparison(unsigned int op,
1202 dst_reg dst, src_reg src0, src_reg src1)
1203 {
1204 /* original gen4 does destination conversion before comparison. */
1205 if (intel->gen < 5)
1206 dst.type = src0.type;
1207
1208 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1209
1210 dst.type = BRW_REGISTER_TYPE_D;
1211 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1212 }
1213
1214 void
1215 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1216 src_reg src0, src_reg src1)
1217 {
1218 vec4_instruction *inst;
1219
1220 if (intel->gen >= 6) {
1221 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1222 inst->conditional_mod = conditionalmod;
1223 } else {
1224 emit(CMP(dst, src0, src1, conditionalmod));
1225
1226 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1227 inst->predicate = BRW_PREDICATE_NORMAL;
1228 }
1229 }
1230
1231 void
1232 vec4_visitor::visit(ir_expression *ir)
1233 {
1234 unsigned int operand;
1235 src_reg op[Elements(ir->operands)];
1236 src_reg result_src;
1237 dst_reg result_dst;
1238 vec4_instruction *inst;
1239
1240 if (try_emit_sat(ir))
1241 return;
1242
1243 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1244 this->result.file = BAD_FILE;
1245 ir->operands[operand]->accept(this);
1246 if (this->result.file == BAD_FILE) {
1247 printf("Failed to get tree for expression operand:\n");
1248 ir->operands[operand]->print();
1249 exit(1);
1250 }
1251 op[operand] = this->result;
1252
1253 /* Matrix expression operands should have been broken down to vector
1254 * operations already.
1255 */
1256 assert(!ir->operands[operand]->type->is_matrix());
1257 }
1258
1259 int vector_elements = ir->operands[0]->type->vector_elements;
1260 if (ir->operands[1]) {
1261 vector_elements = MAX2(vector_elements,
1262 ir->operands[1]->type->vector_elements);
1263 }
1264
1265 this->result.file = BAD_FILE;
1266
1267 /* Storage for our result. Ideally for an assignment we'd be using
1268 * the actual storage for the result here, instead.
1269 */
1270 result_src = src_reg(this, ir->type);
1271 /* convenience for the emit functions below. */
1272 result_dst = dst_reg(result_src);
1273 /* If nothing special happens, this is the result. */
1274 this->result = result_src;
1275 /* Limit writes to the channels that will be used by result_src later.
1276 * This does limit this temp's use as a temporary for multi-instruction
1277 * sequences.
1278 */
1279 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1280
1281 switch (ir->operation) {
1282 case ir_unop_logic_not:
1283 /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes the
1284 * ones' complement of the whole register, not just bit 0.
1285 */
1286 emit(XOR(result_dst, op[0], src_reg(1)));
1287 break;
1288 case ir_unop_neg:
1289 op[0].negate = !op[0].negate;
1290 this->result = op[0];
1291 break;
1292 case ir_unop_abs:
1293 op[0].abs = true;
1294 op[0].negate = false;
1295 this->result = op[0];
1296 break;
1297
1298 case ir_unop_sign:
1299 emit(MOV(result_dst, src_reg(0.0f)));
1300
1301 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1302 inst = emit(MOV(result_dst, src_reg(1.0f)));
1303 inst->predicate = BRW_PREDICATE_NORMAL;
1304
1305 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1306 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1307 inst->predicate = BRW_PREDICATE_NORMAL;
1308
1309 break;
1310
1311 case ir_unop_rcp:
1312 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1313 break;
1314
1315 case ir_unop_exp2:
1316 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1317 break;
1318 case ir_unop_log2:
1319 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1320 break;
1321 case ir_unop_exp:
1322 case ir_unop_log:
1323 assert(!"not reached: should be handled by ir_explog_to_explog2");
1324 break;
1325 case ir_unop_sin:
1326 case ir_unop_sin_reduced:
1327 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1328 break;
1329 case ir_unop_cos:
1330 case ir_unop_cos_reduced:
1331 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1332 break;
1333
1334 case ir_unop_dFdx:
1335 case ir_unop_dFdy:
1336 assert(!"derivatives not valid in vertex shader");
1337 break;
1338
1339 case ir_unop_noise:
1340 assert(!"not reached: should be handled by lower_noise");
1341 break;
1342
1343 case ir_binop_add:
1344 emit(ADD(result_dst, op[0], op[1]));
1345 break;
1346 case ir_binop_sub:
1347 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1348 break;
1349
1350 case ir_binop_mul:
1351 if (ir->type->is_integer()) {
1352 /* For integer multiplication, the MUL uses the low 16 bits
1353 * of one of the operands (src0 on gen6, src1 on gen7). The
1354 * MACH accumulates in the contribution of the upper 16 bits
1355 * of that operand.
1356 *
1357 * FINISHME: Emit just the MUL if we know an operand is small
1358 * enough.
1359 */
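/* Illustrative identity (mod 2^32) behind the MUL/MACH split:
 *   a * b == a * (b & 0xffff) + ((a * (b >> 16)) << 16)
 * MUL produces the first partial product and MACH folds in the second,
 * leaving the low 32 bits of the full product in the accumulator for the
 * MOV below to pick up.
 */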
1360 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1361
1362 emit(MUL(acc, op[0], op[1]));
1363 emit(MACH(dst_null_d(), op[0], op[1]));
1364 emit(MOV(result_dst, src_reg(acc)));
1365 } else {
1366 emit(MUL(result_dst, op[0], op[1]));
1367 }
1368 break;
1369 case ir_binop_div:
1370 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1371 assert(ir->type->is_integer());
1372 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1373 break;
1374 case ir_binop_mod:
1375 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1376 assert(ir->type->is_integer());
1377 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1378 break;
1379
1380 case ir_binop_less:
1381 case ir_binop_greater:
1382 case ir_binop_lequal:
1383 case ir_binop_gequal:
1384 case ir_binop_equal:
1385 case ir_binop_nequal: {
1386 emit(CMP(result_dst, op[0], op[1],
1387 brw_conditional_for_comparison(ir->operation)));
1388 emit(AND(result_dst, result_src, src_reg(0x1)));
1389 break;
1390 }
1391
1392 case ir_binop_all_equal:
1393 /* "==" operator producing a scalar boolean. */
1394 if (ir->operands[0]->type->is_vector() ||
1395 ir->operands[1]->type->is_vector()) {
1396 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1397 emit(MOV(result_dst, src_reg(0)));
1398 inst = emit(MOV(result_dst, src_reg(1)));
1399 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1400 } else {
1401 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1402 emit(AND(result_dst, result_src, src_reg(0x1)));
1403 }
1404 break;
1405 case ir_binop_any_nequal:
1406 /* "!=" operator producing a scalar boolean. */
1407 if (ir->operands[0]->type->is_vector() ||
1408 ir->operands[1]->type->is_vector()) {
1409 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1410
1411 emit(MOV(result_dst, src_reg(0)));
1412 inst = emit(MOV(result_dst, src_reg(1)));
1413 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1414 } else {
1415 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1416 emit(AND(result_dst, result_src, src_reg(0x1)));
1417 }
1418 break;
1419
1420 case ir_unop_any:
1421 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1422 emit(MOV(result_dst, src_reg(0)));
1423
1424 inst = emit(MOV(result_dst, src_reg(1)));
1425 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1426 break;
1427
1428 case ir_binop_logic_xor:
1429 emit(XOR(result_dst, op[0], op[1]));
1430 break;
1431
1432 case ir_binop_logic_or:
1433 emit(OR(result_dst, op[0], op[1]));
1434 break;
1435
1436 case ir_binop_logic_and:
1437 emit(AND(result_dst, op[0], op[1]));
1438 break;
1439
1440 case ir_binop_dot:
1441 assert(ir->operands[0]->type->is_vector());
1442 assert(ir->operands[0]->type == ir->operands[1]->type);
1443 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1444 break;
1445
1446 case ir_unop_sqrt:
1447 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1448 break;
1449 case ir_unop_rsq:
1450 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1451 break;
1452
1453 case ir_unop_bitcast_i2f:
1454 case ir_unop_bitcast_u2f:
1455 this->result = op[0];
1456 this->result.type = BRW_REGISTER_TYPE_F;
1457 break;
1458
1459 case ir_unop_bitcast_f2i:
1460 this->result = op[0];
1461 this->result.type = BRW_REGISTER_TYPE_D;
1462 break;
1463
1464 case ir_unop_bitcast_f2u:
1465 this->result = op[0];
1466 this->result.type = BRW_REGISTER_TYPE_UD;
1467 break;
1468
1469 case ir_unop_i2f:
1470 case ir_unop_i2u:
1471 case ir_unop_u2i:
1472 case ir_unop_u2f:
1473 case ir_unop_b2f:
1474 case ir_unop_b2i:
1475 case ir_unop_f2i:
1476 case ir_unop_f2u:
1477 emit(MOV(result_dst, op[0]));
1478 break;
1479 case ir_unop_f2b:
1480 case ir_unop_i2b: {
1481 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1482 emit(AND(result_dst, result_src, src_reg(1)));
1483 break;
1484 }
1485
1486 case ir_unop_trunc:
1487 emit(RNDZ(result_dst, op[0]));
1488 break;
1489 case ir_unop_ceil:
1490 op[0].negate = !op[0].negate;
1491 inst = emit(RNDD(result_dst, op[0]));
1492 this->result.negate = true;
1493 break;
1494 case ir_unop_floor:
1495 inst = emit(RNDD(result_dst, op[0]));
1496 break;
1497 case ir_unop_fract:
1498 inst = emit(FRC(result_dst, op[0]));
1499 break;
1500 case ir_unop_round_even:
1501 emit(RNDE(result_dst, op[0]));
1502 break;
1503
1504 case ir_binop_min:
1505 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1506 break;
1507 case ir_binop_max:
1508 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1509 break;
1510
1511 case ir_binop_pow:
1512 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1513 break;
1514
1515 case ir_unop_bit_not:
1516 inst = emit(NOT(result_dst, op[0]));
1517 break;
1518 case ir_binop_bit_and:
1519 inst = emit(AND(result_dst, op[0], op[1]));
1520 break;
1521 case ir_binop_bit_xor:
1522 inst = emit(XOR(result_dst, op[0], op[1]));
1523 break;
1524 case ir_binop_bit_or:
1525 inst = emit(OR(result_dst, op[0], op[1]));
1526 break;
1527
1528 case ir_binop_lshift:
1529 inst = emit(SHL(result_dst, op[0], op[1]));
1530 break;
1531
1532 case ir_binop_rshift:
1533 if (ir->type->base_type == GLSL_TYPE_INT)
1534 inst = emit(ASR(result_dst, op[0], op[1]));
1535 else
1536 inst = emit(SHR(result_dst, op[0], op[1]));
1537 break;
1538
1539 case ir_binop_ubo_load: {
1540 ir_constant *uniform_block = ir->operands[0]->as_constant();
1541 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1542 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1543 src_reg offset = op[1];
1544
1545 /* Now, load the vector from that offset. */
1546 assert(ir->type->is_vector() || ir->type->is_scalar());
1547
1548 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1549 packed_consts.type = result.type;
1550 src_reg surf_index =
1551 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1552 if (const_offset_ir) {
1553 offset = src_reg(const_offset / 16);
1554 } else {
1555 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1556 }
1557
1558 vec4_instruction *pull =
1559 emit(new(mem_ctx) vec4_instruction(this,
1560 VS_OPCODE_PULL_CONSTANT_LOAD,
1561 dst_reg(packed_consts),
1562 surf_index,
1563 offset));
1564 pull->base_mrf = 14;
1565 pull->mlen = 1;
1566
1567 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1568 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1569 const_offset % 16 / 4,
1570 const_offset % 16 / 4,
1571 const_offset % 16 / 4);
1572
1573 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1574 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1575 emit(CMP(result_dst, packed_consts, src_reg(0u),
1576 BRW_CONDITIONAL_NZ));
1577 emit(AND(result_dst, result, src_reg(0x1)));
1578 } else {
1579 emit(MOV(result_dst, packed_consts));
1580 }
1581 break;
1582 }
1583
1584 case ir_quadop_vector:
1585 assert(!"not reached: should be handled by lower_quadop_vector");
1586 break;
1587
1588 case ir_unop_pack_half_2x16:
1589 emit_pack_half_2x16(result_dst, op[0]);
1590 break;
1591 case ir_unop_unpack_half_2x16:
1592 emit_unpack_half_2x16(result_dst, op[0]);
1593 break;
1594 case ir_unop_pack_snorm_2x16:
1595 case ir_unop_pack_unorm_2x16:
1596 case ir_unop_unpack_snorm_2x16:
1597 case ir_unop_unpack_unorm_2x16:
1598 assert(!"not reached: should be handled by lower_packing_builtins");
1599 break;
1600 case ir_unop_unpack_half_2x16_split_x:
1601 case ir_unop_unpack_half_2x16_split_y:
1602 case ir_binop_pack_half_2x16_split:
1603 assert(!"not reached: should not occur in vertex shader");
1604 break;
1605 }
1606 }
1607
1608
1609 void
1610 vec4_visitor::visit(ir_swizzle *ir)
1611 {
1612 src_reg src;
1613 int i = 0;
1614 int swizzle[4];
1615
1616 /* Note that this is only swizzles in expressions, not those on the left
1617 * hand side of an assignment, which do write masking. See ir_assignment
1618 * for that.
1619 */
1620
1621 ir->val->accept(this);
1622 src = this->result;
1623 assert(src.file != BAD_FILE);
1624
1625 for (i = 0; i < ir->type->vector_elements; i++) {
1626 switch (i) {
1627 case 0:
1628 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1629 break;
1630 case 1:
1631 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1632 break;
1633 case 2:
1634 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1635 break;
1636 case 3:
1637 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1638 break;
1639 }
1640 }
1641 for (; i < 4; i++) {
1642 /* Replicate the last channel out. */
1643 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1644 }
1645
1646 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1647
1648 this->result = src;
1649 }
1650
1651 void
1652 vec4_visitor::visit(ir_dereference_variable *ir)
1653 {
1654 const struct glsl_type *type = ir->type;
1655 dst_reg *reg = variable_storage(ir->var);
1656
1657 if (!reg) {
1658 fail("Failed to find variable storage for %s\n", ir->var->name);
1659 this->result = src_reg(brw_null_reg());
1660 return;
1661 }
1662
1663 this->result = src_reg(*reg);
1664
1665 /* System values get their swizzle from the dst_reg writemask */
1666 if (ir->var->mode == ir_var_system_value)
1667 return;
1668
1669 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1670 this->result.swizzle = swizzle_for_size(type->vector_elements);
1671 }
1672
1673 void
1674 vec4_visitor::visit(ir_dereference_array *ir)
1675 {
1676 ir_constant *constant_index;
1677 src_reg src;
1678 int element_size = type_size(ir->type);
1679
1680 constant_index = ir->array_index->constant_expression_value();
1681
1682 ir->array->accept(this);
1683 src = this->result;
1684
1685 if (constant_index) {
1686 src.reg_offset += constant_index->value.i[0] * element_size;
1687 } else {
1688 /* Variable index array dereference. It eats the "vec4" of the
1689 * base of the array and an index that offsets the Mesa register
1690 * index.
1691 */
1692 ir->array_index->accept(this);
1693
1694 src_reg index_reg;
1695
1696 if (element_size == 1) {
1697 index_reg = this->result;
1698 } else {
1699 index_reg = src_reg(this, glsl_type::int_type);
1700
1701 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1702 }
1703
1704 if (src.reladdr) {
1705 src_reg temp = src_reg(this, glsl_type::int_type);
1706
1707 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1708
1709 index_reg = temp;
1710 }
1711
1712 src.reladdr = ralloc(mem_ctx, src_reg);
1713 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1714 }
1715
1716 /* If the type is smaller than a vec4, replicate the last channel out. */
1717 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1718 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1719 else
1720 src.swizzle = BRW_SWIZZLE_NOOP;
1721 src.type = brw_type_for_base_type(ir->type);
1722
1723 this->result = src;
1724 }
1725
1726 void
1727 vec4_visitor::visit(ir_dereference_record *ir)
1728 {
1729 unsigned int i;
1730 const glsl_type *struct_type = ir->record->type;
1731 int offset = 0;
1732
1733 ir->record->accept(this);
1734
1735 for (i = 0; i < struct_type->length; i++) {
1736 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1737 break;
1738 offset += type_size(struct_type->fields.structure[i].type);
1739 }
1740
1741 /* If the type is smaller than a vec4, replicate the last channel out. */
1742 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1743 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1744 else
1745 this->result.swizzle = BRW_SWIZZLE_NOOP;
1746 this->result.type = brw_type_for_base_type(ir->type);
1747
1748 this->result.reg_offset += offset;
1749 }
1750
1751 /**
1752 * We want to be careful in assignment setup to hit the actual storage
1753 * instead of potentially using a temporary like we might with the
1754 * ir_dereference handler.
1755 */
1756 static dst_reg
1757 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1758 {
1759 /* The LHS must be a dereference. If the LHS is a variable indexed array
1760 * access of a vector, it must be separated into a series of conditional moves
1761 * before reaching this point (see ir_vec_index_to_cond_assign).
1762 */
1763 assert(ir->as_dereference());
1764 ir_dereference_array *deref_array = ir->as_dereference_array();
1765 if (deref_array) {
1766 assert(!deref_array->array->type->is_vector());
1767 }
1768
1769 /* Use the rvalue deref handler for the most part. We'll ignore
1770 * swizzles in it and write swizzles using writemask, though.
1771 */
1772 ir->accept(v);
1773 return dst_reg(v->result);
1774 }
1775
1776 void
1777 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1778 const struct glsl_type *type, uint32_t predicate)
1779 {
1780 if (type->base_type == GLSL_TYPE_STRUCT) {
1781 for (unsigned int i = 0; i < type->length; i++) {
1782 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1783 }
1784 return;
1785 }
1786
1787 if (type->is_array()) {
1788 for (unsigned int i = 0; i < type->length; i++) {
1789 emit_block_move(dst, src, type->fields.array, predicate);
1790 }
1791 return;
1792 }
1793
1794 if (type->is_matrix()) {
1795 const struct glsl_type *vec_type;
1796
1797 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1798 type->vector_elements, 1);
1799
1800 for (int i = 0; i < type->matrix_columns; i++) {
1801 emit_block_move(dst, src, vec_type, predicate);
1802 }
1803 return;
1804 }
1805
1806 assert(type->is_scalar() || type->is_vector());
1807
1808 dst->type = brw_type_for_base_type(type);
1809 src->type = dst->type;
1810
1811 dst->writemask = (1 << type->vector_elements) - 1;
1812
1813 src->swizzle = swizzle_for_size(type->vector_elements);
1814
1815 vec4_instruction *inst = emit(MOV(*dst, *src));
1816 inst->predicate = predicate;
1817
1818 dst->reg_offset++;
1819 src->reg_offset++;
1820 }
1821
1822
1823 /* If the RHS processing resulted in an instruction generating a
1824 * temporary value, and it would be easy to rewrite the instruction to
1825 * generate its result right into the LHS instead, do so. This ends
1826 * up reliably removing instructions where it can be tricky to do so
1827 * later without real UD chain information.
1828 */
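/* Illustrative example: for "v = a + b;" the RHS generates an ADD into a
 * fresh temporary; when the checks below pass, that ADD is retargeted to
 * write v's register directly and the trailing MOV is never emitted.
 */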
1829 bool
1830 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1831 dst_reg dst,
1832 src_reg src,
1833 vec4_instruction *pre_rhs_inst,
1834 vec4_instruction *last_rhs_inst)
1835 {
1836 /* This could be supported, but it would take more smarts. */
1837 if (ir->condition)
1838 return false;
1839
1840 if (pre_rhs_inst == last_rhs_inst)
1841 return false; /* No instructions generated to work with. */
1842
1843 /* Make sure the last instruction generated our source reg. */
1844 if (src.file != GRF ||
1845 src.file != last_rhs_inst->dst.file ||
1846 src.reg != last_rhs_inst->dst.reg ||
1847 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1848 src.reladdr ||
1849 src.abs ||
1850 src.negate ||
1851 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1852 return false;
1853
1854 /* Check that that last instruction fully initialized the channels
1855 * we want to use, in the order we want to use them. We could
1856 * potentially reswizzle the operands of many instructions so that
1857 * we could handle out of order channels, but don't yet.
1858 */
1859
1860 for (unsigned i = 0; i < 4; i++) {
1861 if (dst.writemask & (1 << i)) {
1862 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1863 return false;
1864
1865 if (BRW_GET_SWZ(src.swizzle, i) != i)
1866 return false;
1867 }
1868 }
1869
1870 /* Success! Rewrite the instruction. */
1871 last_rhs_inst->dst.file = dst.file;
1872 last_rhs_inst->dst.reg = dst.reg;
1873 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1874 last_rhs_inst->dst.reladdr = dst.reladdr;
1875 last_rhs_inst->dst.writemask &= dst.writemask;
1876
1877 return true;
1878 }
1879
1880 void
1881 vec4_visitor::visit(ir_assignment *ir)
1882 {
1883 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1884 uint32_t predicate = BRW_PREDICATE_NONE;
1885
1886 if (!ir->lhs->type->is_scalar() &&
1887 !ir->lhs->type->is_vector()) {
1888 ir->rhs->accept(this);
1889 src_reg src = this->result;
1890
1891 if (ir->condition) {
1892 emit_bool_to_cond_code(ir->condition, &predicate);
1893 }
1894
1895 /* emit_block_move doesn't account for swizzles in the source register.
1896 * This should be ok, since the source register is a structure or an
1897 * array, and those can't be swizzled. But double-check to be sure.
1898 */
1899 assert(src.swizzle ==
1900 (ir->rhs->type->is_matrix()
1901 ? swizzle_for_size(ir->rhs->type->vector_elements)
1902 : BRW_SWIZZLE_NOOP));
1903
1904 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1905 return;
1906 }
1907
1908 /* Now we're down to just a scalar/vector with writemasks. */
1909 int i;
1910
1911 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1912 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1913
1914 ir->rhs->accept(this);
1915
1916 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1917
1918 src_reg src = this->result;
1919
1920 int swizzles[4];
1921 int first_enabled_chan = 0;
1922 int src_chan = 0;
1923
1924 assert(ir->lhs->type->is_vector() ||
1925 ir->lhs->type->is_scalar());
1926 dst.writemask = ir->write_mask;
1927
1928 for (int i = 0; i < 4; i++) {
1929 if (dst.writemask & (1 << i)) {
1930 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1931 break;
1932 }
1933 }
1934
1935 /* Swizzle a small RHS vector into the channels being written.
1936 *
1937 * GLSL IR treats write_mask as dictating how many channels are
1938 * present on the RHS, while our instructions need those channels to
1939 * appear in the slots of the vec4 they're written to.
1940 */
1941 for (int i = 0; i < 4; i++) {
1942 if (dst.writemask & (1 << i))
1943 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1944 else
1945 swizzles[i] = first_enabled_chan;
1946 }
1947 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1948 swizzles[2], swizzles[3]);
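/* For example, `v.yz = u;` with a vec2 RHS (result swizzle .xyyy) yields
 * swizzles {y, x, y, y}: the MOV below reads u.x into v.y and u.y into v.z,
 * and the unwritten slots point at an already-initialized channel.
 */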
1949
1950 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1951 return;
1952 }
1953
1954 if (ir->condition) {
1955 emit_bool_to_cond_code(ir->condition, &predicate);
1956 }
1957
1958 for (i = 0; i < type_size(ir->lhs->type); i++) {
1959 vec4_instruction *inst = emit(MOV(dst, src));
1960 inst->predicate = predicate;
1961
1962 dst.reg_offset++;
1963 src.reg_offset++;
1964 }
1965 }
1966
1967 void
1968 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1969 {
1970 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1971 foreach_list(node, &ir->components) {
1972 ir_constant *field_value = (ir_constant *)node;
1973
1974 emit_constant_values(dst, field_value);
1975 }
1976 return;
1977 }
1978
1979 if (ir->type->is_array()) {
1980 for (unsigned int i = 0; i < ir->type->length; i++) {
1981 emit_constant_values(dst, ir->array_elements[i]);
1982 }
1983 return;
1984 }
1985
1986 if (ir->type->is_matrix()) {
1987 for (int i = 0; i < ir->type->matrix_columns; i++) {
1988 float *vec = &ir->value.f[i * ir->type->vector_elements];
1989
1990 for (int j = 0; j < ir->type->vector_elements; j++) {
1991 dst->writemask = 1 << j;
1992 dst->type = BRW_REGISTER_TYPE_F;
1993
1994 emit(MOV(*dst, src_reg(vec[j])));
1995 }
1996 dst->reg_offset++;
1997 }
1998 return;
1999 }
2000
2001 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2002
2003 for (int i = 0; i < ir->type->vector_elements; i++) {
2004 if (!(remaining_writemask & (1 << i)))
2005 continue;
2006
2007 dst->writemask = 1 << i;
2008 dst->type = brw_type_for_base_type(ir->type);
2009
2010 /* Find other components that match the one we're about to
2011 * write. Emits fewer instructions for things like vec4(0.5,
2012 * 1.5, 1.5, 1.5).
2013 */
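/* For vec4(0.5, 1.5, 1.5, 1.5) this collapses to MOV dst.x, 0.5f plus a
 * single MOV dst.yzw, 1.5f.
 */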
2014 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2015 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2016 if (ir->value.b[i] == ir->value.b[j])
2017 dst->writemask |= (1 << j);
2018 } else {
2019 /* u, i, and f storage all line up, so no need for a
2020 * switch case for comparing each type.
2021 */
2022 if (ir->value.u[i] == ir->value.u[j])
2023 dst->writemask |= (1 << j);
2024 }
2025 }
2026
2027 switch (ir->type->base_type) {
2028 case GLSL_TYPE_FLOAT:
2029 emit(MOV(*dst, src_reg(ir->value.f[i])));
2030 break;
2031 case GLSL_TYPE_INT:
2032 emit(MOV(*dst, src_reg(ir->value.i[i])));
2033 break;
2034 case GLSL_TYPE_UINT:
2035 emit(MOV(*dst, src_reg(ir->value.u[i])));
2036 break;
2037 case GLSL_TYPE_BOOL:
2038 emit(MOV(*dst, src_reg(ir->value.b[i])));
2039 break;
2040 default:
2041 assert(!"Non-float/uint/int/bool constant");
2042 break;
2043 }
2044
2045 remaining_writemask &= ~dst->writemask;
2046 }
2047 dst->reg_offset++;
2048 }
2049
2050 void
2051 vec4_visitor::visit(ir_constant *ir)
2052 {
2053 dst_reg dst = dst_reg(this, ir->type);
2054 this->result = src_reg(dst);
2055
2056 emit_constant_values(&dst, ir);
2057 }
2058
2059 void
2060 vec4_visitor::visit(ir_call *ir)
2061 {
2062 assert(!"not reached");
2063 }
2064
2065 void
2066 vec4_visitor::visit(ir_texture *ir)
2067 {
2068 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
2069
2070 /* Should be lowered by do_lower_texture_projection */
2071 assert(!ir->projector);
2072
2073 /* Generate code to compute all the subexpression trees. This has to be
2074 * done before loading any values into MRFs for the sampler message since
2075 * generating these values may involve SEND messages that need the MRFs.
2076 */
2077 src_reg coordinate;
2078 if (ir->coordinate) {
2079 ir->coordinate->accept(this);
2080 coordinate = this->result;
2081 }
2082
2083 src_reg shadow_comparitor;
2084 if (ir->shadow_comparitor) {
2085 ir->shadow_comparitor->accept(this);
2086 shadow_comparitor = this->result;
2087 }
2088
2089 const glsl_type *lod_type;
2090 src_reg lod, dPdx, dPdy;
2091 switch (ir->op) {
2092 case ir_tex:
2093 lod = src_reg(0.0f);
2094 lod_type = glsl_type::float_type;
2095 break;
2096 case ir_txf:
2097 case ir_txl:
2098 case ir_txs:
2099 ir->lod_info.lod->accept(this);
2100 lod = this->result;
2101 lod_type = ir->lod_info.lod->type;
2102 break;
2103 case ir_txd:
2104 ir->lod_info.grad.dPdx->accept(this);
2105 dPdx = this->result;
2106
2107 ir->lod_info.grad.dPdy->accept(this);
2108 dPdy = this->result;
2109
2110 lod_type = ir->lod_info.grad.dPdx->type;
2111 break;
2112 case ir_txb:
2113 break;
2114 }
2115
2116 vec4_instruction *inst = NULL;
2117 switch (ir->op) {
2118 case ir_tex:
2119 case ir_txl:
2120 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2121 break;
2122 case ir_txd:
2123 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2124 break;
2125 case ir_txf:
2126 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2127 break;
2128 case ir_txs:
2129 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2130 break;
2131 case ir_txb:
2132 assert(!"TXB is not valid for vertex shaders.");
2133 }
2134
2135 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2136
2137 /* Texel offsets go in the message header; Gen4 also requires headers. */
2138 inst->header_present = use_texture_offset || intel->gen < 5;
2139 inst->base_mrf = 2;
2140 inst->mlen = inst->header_present + 1; /* always at least one */
2141 inst->sampler = sampler;
2142 inst->dst = dst_reg(this, ir->type);
2143 inst->dst.writemask = WRITEMASK_XYZW;
2144 inst->shadow_compare = ir->shadow_comparitor != NULL;
2145
2146 if (use_texture_offset)
2147 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2148
2149 /* MRF for the first parameter */
2150 int param_base = inst->base_mrf + inst->header_present;
2151
2152 if (ir->op == ir_txs) {
2153 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2154 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2155 } else {
2156 int i, coord_mask = 0, zero_mask = 0;
2157 /* Load the coordinate */
2158 /* FINISHME: gl_clamp_mask and saturate */
2159 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2160 coord_mask |= (1 << i);
2161 for (; i < 4; i++)
2162 zero_mask |= (1 << i);
2163
2164 if (ir->offset && ir->op == ir_txf) {
2165 /* It appears that the ld instruction used for txf does its
2166 * address bounds check before adding in the offset. To work
2167 * around this, just add the integer offset to the integer
2168 * texel coordinate, and don't put the offset in the header.
2169 */
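/* For example, a texelFetchOffset() with a constant offset of ivec2(1, 2)
 * becomes ADD m.x, coord.x, 1 and ADD m.y, coord.y, 2 below instead of
 * placing the offset in the header.
 */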
2170 ir_constant *offset = ir->offset->as_constant();
2171 assert(offset);
2172
2173 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2174 src_reg src = coordinate;
2175 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2176 BRW_GET_SWZ(src.swizzle, j),
2177 BRW_GET_SWZ(src.swizzle, j),
2178 BRW_GET_SWZ(src.swizzle, j));
2179 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2180 src, offset->value.i[j]));
2181 }
2182 } else {
2183 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2184 coordinate));
2185 }
2186 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2187 src_reg(0)));
2188 /* Load the shadow comparator */
2189 if (ir->shadow_comparitor) {
2190 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2191 WRITEMASK_X),
2192 shadow_comparitor));
2193 inst->mlen++;
2194 }
2195
2196 /* Load the LOD info */
2197 if (ir->op == ir_tex || ir->op == ir_txl) {
2198 int mrf, writemask;
2199 if (intel->gen >= 5) {
2200 mrf = param_base + 1;
2201 if (ir->shadow_comparitor) {
2202 writemask = WRITEMASK_Y;
2203 /* mlen already incremented */
2204 } else {
2205 writemask = WRITEMASK_X;
2206 inst->mlen++;
2207 }
2208 } else /* intel->gen == 4 */ {
2209 mrf = param_base;
2210 writemask = WRITEMASK_Z;
2211 }
2212 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2213 } else if (ir->op == ir_txf) {
2214 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W),
2215 lod));
2216 } else if (ir->op == ir_txd) {
2217 const glsl_type *type = lod_type;
2218
2219 if (intel->gen >= 5) {
2220 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y);
2221 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y);
2222 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2223 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2224 inst->mlen++;
2225
2226 if (ir->type->vector_elements == 3) {
2227 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2228 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2229 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2230 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2231 inst->mlen++;
2232 }
2233 } else /* intel->gen == 4 */ {
2234 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2235 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2236 inst->mlen += 2;
2237 }
2238 }
2239 }
2240
2241 emit(inst);
2242
2243 /* Fix up the number of layers (z) for cube map arrays: the hardware
2244 * returns faces * layers, but the spec requires just layers.
2245 */
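/* For example, a cube map array with 4 layer-cubes has 24 slices; the
 * hardware reports 24 in .z, and the INT_QUOTIENT by 6 below yields the 4
 * the spec requires.
 */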
2246 if (ir->op == ir_txs) {
2247 glsl_type const *type = ir->sampler->type;
2248 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2249 type->sampler_array) {
2250 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2251 with_writemask(inst->dst, WRITEMASK_Z),
2252 src_reg(inst->dst), src_reg(6));
2253 }
2254 }
2255
2256 swizzle_result(ir, src_reg(inst->dst), sampler);
2257 }
2258
2259 void
2260 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2261 {
2262 int s = c->key.tex.swizzles[sampler];
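/* s packs four 3-bit selectors (one per result channel), taken from the
 * texture's swizzle state in the program key; SWIZZLE_ZERO and SWIZZLE_ONE
 * selectors are handled below with immediate MOVs.
 */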
2263
2264 this->result = src_reg(this, ir->type);
2265 dst_reg swizzled_result(this->result);
2266
2267 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2268 || s == SWIZZLE_NOOP) {
2269 emit(MOV(swizzled_result, orig_val));
2270 return;
2271 }
2272
2273 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2274 int swizzle[4];
2275
2276 for (int i = 0; i < 4; i++) {
2277 switch (GET_SWZ(s, i)) {
2278 case SWIZZLE_ZERO:
2279 zero_mask |= (1 << i);
2280 break;
2281 case SWIZZLE_ONE:
2282 one_mask |= (1 << i);
2283 break;
2284 default:
2285 copy_mask |= (1 << i);
2286 swizzle[i] = GET_SWZ(s, i);
2287 break;
2288 }
2289 }
2290
2291 if (copy_mask) {
2292 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2293 swizzled_result.writemask = copy_mask;
2294 emit(MOV(swizzled_result, orig_val));
2295 }
2296
2297 if (zero_mask) {
2298 swizzled_result.writemask = zero_mask;
2299 emit(MOV(swizzled_result, src_reg(0.0f)));
2300 }
2301
2302 if (one_mask) {
2303 swizzled_result.writemask = one_mask;
2304 emit(MOV(swizzled_result, src_reg(1.0f)));
2305 }
2306 }
2307
2308 void
2309 vec4_visitor::visit(ir_return *ir)
2310 {
2311 assert(!"not reached");
2312 }
2313
2314 void
2315 vec4_visitor::visit(ir_discard *ir)
2316 {
2317 assert(!"not reached");
2318 }
2319
2320 void
2321 vec4_visitor::visit(ir_if *ir)
2322 {
2323 /* Don't point the annotation at the if statement, because then it plus
2324 * the then and else blocks get printed.
2325 */
2326 this->base_ir = ir->condition;
2327
2328 if (intel->gen == 6) {
2329 emit_if_gen6(ir);
2330 } else {
2331 uint32_t predicate;
2332 emit_bool_to_cond_code(ir->condition, &predicate);
2333 emit(IF(predicate));
2334 }
2335
2336 visit_instructions(&ir->then_instructions);
2337
2338 if (!ir->else_instructions.is_empty()) {
2339 this->base_ir = ir->condition;
2340 emit(BRW_OPCODE_ELSE);
2341
2342 visit_instructions(&ir->else_instructions);
2343 }
2344
2345 this->base_ir = ir->condition;
2346 emit(BRW_OPCODE_ENDIF);
2347 }
2348
2349 void
2350 vec4_visitor::emit_ndc_computation()
2351 {
2352 /* Get the position */
2353 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2354
2355 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2356 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2357 output_reg[BRW_VERT_RESULT_NDC] = ndc;
2358
2359 current_annotation = "NDC";
2360 dst_reg ndc_w = ndc;
2361 ndc_w.writemask = WRITEMASK_W;
2362 src_reg pos_w = pos;
2363 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2364 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2365
2366 dst_reg ndc_xyz = ndc;
2367 ndc_xyz.writemask = WRITEMASK_XYZ;
2368
2369 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2370 }
2371
2372 void
2373 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2374 {
2375 if (intel->gen < 6 &&
2376 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2377 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2378 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2379 dst_reg header1_w = header1;
2380 header1_w.writemask = WRITEMASK_W;
2381 GLuint i;
2382
2383 emit(MOV(header1, 0u));
2384
2385 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2386 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2387
2388 current_annotation = "Point size";
2389 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2390 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2391 }
2392
2393 current_annotation = "Clipping flags";
2394 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2395 vec4_instruction *inst;
2396
2397 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2398 src_reg(this->userplane[i])));
2399 inst->conditional_mod = BRW_CONDITIONAL_L;
2400
2401 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2402 inst->predicate = BRW_PREDICATE_NORMAL;
2403 }
2404
2405 /* i965 clipping workaround:
2406 * 1) Test for -ve rhw
2407 * 2) If set,
2408 * set ndc = (0,0,0,0)
2409 * set ucp[6] = 1
2410 *
2411 * Later, clipping will detect ucp[6] and ensure the primitive is
2412 * clipped against all fixed planes.
2413 */
2414 if (brw->has_negative_rhw_bug) {
2415 #if 0
2416 /* FINISHME */
2417 brw_CMP(p,
2418 vec8(brw_null_reg()),
2419 BRW_CONDITIONAL_L,
2420 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2421 brw_imm_f(0));
2422
2423 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2424 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2425 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2426 #endif
2427 }
2428
2429 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2430 } else if (intel->gen < 6) {
2431 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2432 } else {
2433 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2434 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2435 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2436 src_reg(output_reg[VERT_RESULT_PSIZ])));
2437 }
2438 }
2439 }
2440
2441 void
2442 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2443 {
2444 if (intel->gen < 6) {
2445 /* Clip distance slots are set aside in gen5, but they are not used. It
2446 * is not clear whether we actually need to set aside space for them,
2447 * but the performance cost is negligible.
2448 */
2449 return;
2450 }
2451
2452 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2453 *
2454 * "If a linked set of shaders forming the vertex stage contains no
2455 * static write to gl_ClipVertex or gl_ClipDistance, but the
2456 * application has requested clipping against user clip planes through
2457 * the API, then the coordinate written to gl_Position is used for
2458 * comparison against the user clip planes."
2459 *
2460 * This function is only called if the shader didn't write to
2461 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2462 * if the user wrote to it; otherwise we use gl_Position.
2463 */
2464 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2465 if (!(c->prog_data.outputs_written
2466 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2467 clip_vertex = VERT_RESULT_HPOS;
2468 }
2469
2470 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2471 ++i) {
2472 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2473 src_reg(output_reg[clip_vertex]),
2474 src_reg(this->userplane[i + offset])));
2475 }
2476 }
2477
2478 void
2479 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2480 {
2481 assert (vert_result < VERT_RESULT_MAX);
2482 reg.type = output_reg[vert_result].type;
2483 current_annotation = output_reg_annotation[vert_result];
2484 /* Copy the register, saturating if necessary */
2485 vec4_instruction *inst = emit(MOV(reg,
2486 src_reg(output_reg[vert_result])));
2487 if ((vert_result == VERT_RESULT_COL0 ||
2488 vert_result == VERT_RESULT_COL1 ||
2489 vert_result == VERT_RESULT_BFC0 ||
2490 vert_result == VERT_RESULT_BFC1) &&
2491 c->key.clamp_vertex_color) {
2492 inst->saturate = true;
2493 }
2494 }
2495
2496 void
2497 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2498 {
2499 struct brw_reg hw_reg = brw_message_reg(mrf);
2500 dst_reg reg = dst_reg(MRF, mrf);
2501 reg.type = BRW_REGISTER_TYPE_F;
2502
2503 switch (vert_result) {
2504 case VERT_RESULT_PSIZ:
2505 /* PSIZ is always in slot 0, and is coupled with other flags. */
2506 current_annotation = "indices, point width, clip flags";
2507 emit_psiz_and_flags(hw_reg);
2508 break;
2509 case BRW_VERT_RESULT_NDC:
2510 current_annotation = "NDC";
2511 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2512 break;
2513 case BRW_VERT_RESULT_HPOS_DUPLICATE:
2514 case VERT_RESULT_HPOS:
2515 current_annotation = "gl_Position";
2516 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2517 break;
2518 case VERT_RESULT_CLIP_DIST0:
2519 case VERT_RESULT_CLIP_DIST1:
2520 if (this->c->key.uses_clip_distance) {
2521 emit_generic_urb_slot(reg, vert_result);
2522 } else {
2523 current_annotation = "user clip distances";
2524 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2525 }
2526 break;
2527 case VERT_RESULT_EDGE:
2528 /* This is present when doing unfilled polygons. We're supposed to copy
2529 * the edge flag from the user-provided vertex array
2530 * (glEdgeFlagPointer); otherwise we copy the current value of that
2531 * attribute (which starts as 1.0f). Clipping then uses this to
2532 * determine which edges should be drawn as wireframe.
2533 */
2534 current_annotation = "edge flag";
2535 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2536 glsl_type::float_type, WRITEMASK_XYZW))));
2537 break;
2538 case BRW_VERT_RESULT_PAD:
2539 /* No need to write to this slot */
2540 break;
2541 default:
2542 emit_generic_urb_slot(reg, vert_result);
2543 break;
2544 }
2545 }
2546
2547 static int
2548 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2549 {
2550 struct intel_context *intel = &brw->intel;
2551
2552 if (intel->gen >= 6) {
2553 /* URB data written (does not include the message header reg) must
2554 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2555 * section 5.4.3.2.2: URB_INTERLEAVED.
2556 *
2557 * URB entries are allocated on a multiple of 1024 bits, so an
2558 * extra 128 bits written here to make the end align to 256 is
2559 * no problem.
2560 */
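/* For example, an mlen of 8 (header plus 7 data regs) is padded to 9 so
 * the data portion becomes 8 regs.
 */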
2561 if ((mlen % 2) != 1)
2562 mlen++;
2563 }
2564
2565 return mlen;
2566 }
2567
2568 /**
2569 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2570 * complete the VS thread.
2571 *
2572 * The VUE layout is documented in Volume 2a.
2573 */
2574 void
2575 vec4_visitor::emit_urb_writes()
2576 {
2577 /* MRF 0 is reserved for the debugger, so start with message header
2578 * in MRF 1.
2579 */
2580 int base_mrf = 1;
2581 int mrf = base_mrf;
2582 /* In the process of generating our URB write message contents, we
2583 * may need to unspill a register or load from an array. Those
2584 * reads would use MRFs 14-15.
2585 */
2586 int max_usable_mrf = 13;
2587
2588 /* The following assertion verifies that max_usable_mrf yields an
2589 * even number of URB write data registers, which meets gen6's
2590 * length alignment requirements.
2591 */
2592 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2593
2594 /* First mrf is the g0-based message header containing URB handles and such,
2595 * which is implied in VS_OPCODE_URB_WRITE.
2596 */
2597 mrf++;
2598
2599 if (intel->gen < 6) {
2600 emit_ndc_computation();
2601 }
2602
2603 /* Set up the VUE data for the first URB write */
2604 int slot;
2605 for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2606 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2607
2608 /* If this was max_usable_mrf, we can't fit anything more into this URB
2609 * WRITE.
2610 */
2611 if (mrf > max_usable_mrf) {
2612 slot++;
2613 break;
2614 }
2615 }
2616
2617 current_annotation = "URB write";
2618 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2619 inst->base_mrf = base_mrf;
2620 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2621 inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2622
2623 /* Optional second URB write */
2624 if (!inst->eot) {
2625 mrf = base_mrf + 1;
2626
2627 for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2628 assert(mrf < max_usable_mrf);
2629
2630 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2631 }
2632
2633 current_annotation = "URB write";
2634 inst = emit(VS_OPCODE_URB_WRITE);
2635 inst->base_mrf = base_mrf;
2636 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2637 inst->eot = true;
2638 /* URB destination offset. The previous write used MRFs 1-13; minus
2639 * the one header MRF, that's 12 data regs. The URB offset is in
2640 * URB row increments, and each of our MRFs is half of one of
2641 * those, since we're doing interleaved writes.
2642 */
2643 inst->offset = (max_usable_mrf - base_mrf) / 2;
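/* With base_mrf 1 and max_usable_mrf 13, that is (13 - 1) / 2 = 6 URB rows. */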
2644 }
2645 }
2646
2647 src_reg
2648 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2649 src_reg *reladdr, int reg_offset)
2650 {
2651 /* Because we store the values to scratch interleaved like our
2652 * vertex data, we need to scale the vec4 index by 2.
2653 */
2654 int message_header_scale = 2;
2655
2656 /* Pre-gen6, the message header uses byte offsets instead of vec4
2657 * (16-byte) offset units.
2658 */
2659 if (intel->gen < 6)
2660 message_header_scale *= 16;
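/* For example, reg_offset 3 becomes an offset of 6 on gen6+, or 96 bytes
 * (3 * 32) before gen6.
 */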
2661
2662 if (reladdr) {
2663 src_reg index = src_reg(this, glsl_type::int_type);
2664
2665 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2666 emit_before(inst, MUL(dst_reg(index),
2667 index, src_reg(message_header_scale)));
2668
2669 return index;
2670 } else {
2671 return src_reg(reg_offset * message_header_scale);
2672 }
2673 }
2674
2675 src_reg
2676 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2677 src_reg *reladdr, int reg_offset)
2678 {
2679 if (reladdr) {
2680 src_reg index = src_reg(this, glsl_type::int_type);
2681
2682 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2683
2684 /* Pre-gen6, the message header uses byte offsets instead of vec4
2685 * (16-byte) offset units.
2686 */
2687 if (intel->gen < 6) {
2688 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2689 }
2690
2691 return index;
2692 } else {
2693 int message_header_scale = intel->gen < 6 ? 16 : 1;
2694 return src_reg(reg_offset * message_header_scale);
2695 }
2696 }
2697
2698 /**
2699 * Emits an instruction before @inst to load the value named by @orig_src
2700 * from scratch space at @base_offset to @temp.
2701 *
2702 * @base_offset is measured in 32-byte units (the size of a register).
2703 */
2704 void
2705 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2706 dst_reg temp, src_reg orig_src,
2707 int base_offset)
2708 {
2709 int reg_offset = base_offset + orig_src.reg_offset;
2710 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2711
2712 emit_before(inst, SCRATCH_READ(temp, index));
2713 }
2714
2715 /**
2716 * Emits an instruction after @inst to store the value to be written
2717 * to @orig_dst to scratch space at @base_offset, from @temp.
2718 *
2719 * @base_offset is measured in 32-byte units (the size of a register).
2720 */
2721 void
2722 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2723 {
2724 int reg_offset = base_offset + inst->dst.reg_offset;
2725 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2726
2727 /* Create a temporary register to store *inst's result in.
2728 *
2729 * We have to be careful in MOVing from our temporary result register in
2730 * the scratch write. If we swizzle from channels of the temporary that
2731 * weren't initialized, it will confuse live interval analysis, which will
2732 * make spilling fail to make progress.
2733 */
2734 src_reg temp = src_reg(this, glsl_type::vec4_type);
2735 temp.type = inst->dst.type;
2736 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2737 int swizzles[4];
2738 for (int i = 0; i < 4; i++)
2739 if (inst->dst.writemask & (1 << i))
2740 swizzles[i] = i;
2741 else
2742 swizzles[i] = first_writemask_chan;
2743 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2744 swizzles[2], swizzles[3]);
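/* For example, an instruction writing only .xz reads temp with swizzle
 * .xxzx here, so no uninitialized channel is ever referenced.
 */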
2745
2746 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2747 inst->dst.writemask));
2748 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2749 write->predicate = inst->predicate;
2750 write->ir = inst->ir;
2751 write->annotation = inst->annotation;
2752 inst->insert_after(write);
2753
2754 inst->dst.file = temp.file;
2755 inst->dst.reg = temp.reg;
2756 inst->dst.reg_offset = temp.reg_offset;
2757 inst->dst.reladdr = NULL;
2758 }
2759
2760 /**
2761 * We can't generally support array access in GRF space, because a
2762 * single instruction's destination can only span 2 contiguous
2763 * registers. So, we send all GRF arrays that get variable index
2764 * access to scratch space.
2765 */
2766 void
2767 vec4_visitor::move_grf_array_access_to_scratch()
2768 {
2769 int scratch_loc[this->virtual_grf_count];
2770
2771 for (int i = 0; i < this->virtual_grf_count; i++) {
2772 scratch_loc[i] = -1;
2773 }
2774
2775 /* First, calculate the set of virtual GRFs that need to be punted
2776 * to scratch due to having any array access on them, and where in
2777 * scratch.
2778 */
2779 foreach_list(node, &this->instructions) {
2780 vec4_instruction *inst = (vec4_instruction *)node;
2781
2782 if (inst->dst.file == GRF && inst->dst.reladdr &&
2783 scratch_loc[inst->dst.reg] == -1) {
2784 scratch_loc[inst->dst.reg] = c->last_scratch;
2785 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2786 }
2787
2788 for (int i = 0 ; i < 3; i++) {
2789 src_reg *src = &inst->src[i];
2790
2791 if (src->file == GRF && src->reladdr &&
2792 scratch_loc[src->reg] == -1) {
2793 scratch_loc[src->reg] = c->last_scratch;
2794 c->last_scratch += this->virtual_grf_sizes[src->reg];
2795 }
2796 }
2797 }
2798
2799 /* Now, for anything that will be accessed through scratch, rewrite
2800 * it to load/store. Note that this is a _safe list walk, because
2801 * we may generate a new scratch_write instruction after the one
2802 * we're processing.
2803 */
2804 foreach_list_safe(node, &this->instructions) {
2805 vec4_instruction *inst = (vec4_instruction *)node;
2806
2807 /* Set up the annotation tracking for new generated instructions. */
2808 base_ir = inst->ir;
2809 current_annotation = inst->annotation;
2810
2811 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2812 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2813 }
2814
2815 for (int i = 0 ; i < 3; i++) {
2816 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2817 continue;
2818
2819 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2820
2821 emit_scratch_read(inst, temp, inst->src[i],
2822 scratch_loc[inst->src[i].reg]);
2823
2824 inst->src[i].file = temp.file;
2825 inst->src[i].reg = temp.reg;
2826 inst->src[i].reg_offset = temp.reg_offset;
2827 inst->src[i].reladdr = NULL;
2828 }
2829 }
2830 }
2831
2832 /**
2833 * Emits an instruction before @inst to load the value named by @orig_src
2834 * from the pull constant buffer (surface) at @base_offset to @temp.
2835 */
2836 void
2837 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2838 dst_reg temp, src_reg orig_src,
2839 int base_offset)
2840 {
2841 int reg_offset = base_offset + orig_src.reg_offset;
2842 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2843 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2844 vec4_instruction *load;
2845
2846 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2847 temp, index, offset);
2848 load->base_mrf = 14;
2849 load->mlen = 1;
2850 emit_before(inst, load);
2851 }
2852
2853 /**
2854 * Implements array access of uniforms by inserting a
2855 * PULL_CONSTANT_LOAD instruction.
2856 *
2857 * Unlike temporary GRF array access (which we don't support, due to
2858 * the difficulty of doing relative addressing on instruction
2859 * destinations), we could potentially do array access of uniforms
2860 * that were loaded in GRF space as push constants. In real-world
2861 * usage we've seen, though, the arrays being used are always larger
2862 * than we could load as push constants, so just always move all
2863 * uniform array access out to a pull constant buffer.
2864 */
2865 void
2866 vec4_visitor::move_uniform_array_access_to_pull_constants()
2867 {
2868 int pull_constant_loc[this->uniforms];
2869
2870 for (int i = 0; i < this->uniforms; i++) {
2871 pull_constant_loc[i] = -1;
2872 }
2873
2874 /* Walk through and find array access of uniforms. Put a copy of that
2875 * uniform in the pull constant buffer.
2876 *
2877 * Note that we don't move constant-indexed accesses to arrays. No
2878 * testing has been done of the performance impact of this choice.
2879 */
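/* For example, indexing `uniform vec4 colors[64]` with a variable copies the
 * whole array into prog_data->pull_param and replaces each reladdr access
 * with a VS_OPCODE_PULL_CONSTANT_LOAD from the constant surface.
 */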
2880 foreach_list_safe(node, &this->instructions) {
2881 vec4_instruction *inst = (vec4_instruction *)node;
2882
2883 for (int i = 0 ; i < 3; i++) {
2884 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2885 continue;
2886
2887 int uniform = inst->src[i].reg;
2888
2889 /* If this array isn't already present in the pull constant buffer,
2890 * add it.
2891 */
2892 if (pull_constant_loc[uniform] == -1) {
2893 const float **values = &prog_data->param[uniform * 4];
2894
2895 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2896
2897 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2898 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2899 }
2900 }
2901
2902 /* Set up the annotation tracking for new generated instructions. */
2903 base_ir = inst->ir;
2904 current_annotation = inst->annotation;
2905
2906 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2907
2908 emit_pull_constant_load(inst, temp, inst->src[i],
2909 pull_constant_loc[uniform]);
2910
2911 inst->src[i].file = temp.file;
2912 inst->src[i].reg = temp.reg;
2913 inst->src[i].reg_offset = temp.reg_offset;
2914 inst->src[i].reladdr = NULL;
2915 }
2916 }
2917
2918 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2919 * no need to track them as larger-than-vec4 objects. This will be
2920 * relied on in cutting out unused uniform vectors from push
2921 * constants.
2922 */
2923 split_uniform_registers();
2924 }
2925
2926 void
2927 vec4_visitor::resolve_ud_negate(src_reg *reg)
2928 {
2929 if (reg->type != BRW_REGISTER_TYPE_UD ||
2930 !reg->negate)
2931 return;
2932
2933 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2934 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2935 *reg = temp;
2936 }
2937
2938 vec4_visitor::vec4_visitor(struct brw_context *brw,
2939 struct brw_vs_compile *c,
2940 struct gl_shader_program *prog,
2941 struct brw_shader *shader,
2942 void *mem_ctx)
2943 {
2944 this->c = c;
2945 this->brw = brw;
2946 this->intel = &brw->intel;
2947 this->ctx = &intel->ctx;
2948 this->prog = prog;
2949 this->shader = shader;
2950
2951 this->mem_ctx = mem_ctx;
2952 this->failed = false;
2953
2954 this->base_ir = NULL;
2955 this->current_annotation = NULL;
2956 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
2957
2958 this->c = c;
2959 this->vp = &c->vp->program;
2960 this->prog_data = &c->prog_data;
2961
2962 this->variable_ht = hash_table_ctor(0,
2963 hash_table_pointer_hash,
2964 hash_table_pointer_compare);
2965
2966 this->virtual_grf_def = NULL;
2967 this->virtual_grf_use = NULL;
2968 this->virtual_grf_sizes = NULL;
2969 this->virtual_grf_count = 0;
2970 this->virtual_grf_reg_map = NULL;
2971 this->virtual_grf_reg_count = 0;
2972 this->virtual_grf_array_size = 0;
2973 this->live_intervals_valid = false;
2974
2975 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2976
2977 this->uniforms = 0;
2978 }
2979
2980 vec4_visitor::~vec4_visitor()
2981 {
2982 hash_table_dtor(this->variable_ht);
2983 }
2984
2985
2986 void
2987 vec4_visitor::fail(const char *format, ...)
2988 {
2989 va_list va;
2990 char *msg;
2991
2992 if (failed)
2993 return;
2994
2995 failed = true;
2996
2997 va_start(va, format);
2998 msg = ralloc_vasprintf(mem_ctx, format, va);
2999 va_end(va);
3000 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3001
3002 this->fail_msg = msg;
3003
3004 if (INTEL_DEBUG & DEBUG_VS) {
3005 fprintf(stderr, "%s", msg);
3006 }
3007 }
3008
3009 } /* namespace brw */