i965/vs: split brw_vs_compile into generic and VS-specific parts.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 ALU1(NOT)
111 ALU1(MOV)
112 ALU1(FRC)
113 ALU1(RNDD)
114 ALU1(RNDE)
115 ALU1(RNDZ)
116 ALU1(F32TO16)
117 ALU1(F16TO32)
118 ALU2(ADD)
119 ALU2(MUL)
120 ALU2(MACH)
121 ALU2(AND)
122 ALU2(OR)
123 ALU2(XOR)
124 ALU2(DP3)
125 ALU2(DP4)
126 ALU2(DPH)
127 ALU2(SHL)
128 ALU2(SHR)
129 ALU2(ASR)
130
131 /** Gen4 predicated IF. */
132 vec4_instruction *
133 vec4_visitor::IF(uint32_t predicate)
134 {
135 vec4_instruction *inst;
136
137 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
138 inst->predicate = predicate;
139
140 return inst;
141 }
142
143 /** Gen6+ IF with embedded comparison. */
144 vec4_instruction *
145 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
146 {
147 assert(intel->gen >= 6);
148
149 vec4_instruction *inst;
150
151 resolve_ud_negate(&src0);
152 resolve_ud_negate(&src1);
153
154 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
155 src0, src1);
156 inst->conditional_mod = condition;
157
158 return inst;
159 }
160
161 /**
162 * CMP: Sets the low bit of the destination channels with the result
163 * of the comparison, while the upper bits are undefined, and updates
164 * the flag register with the packed 16 bits of the result.
165 */
166 vec4_instruction *
167 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
168 {
169 vec4_instruction *inst;
170
171 /* original gen4 does type conversion to the destination type
172     * before comparison, producing garbage results for floating
173 * point comparisons.
174 */
175 if (intel->gen == 4) {
176 dst.type = src0.type;
177 if (dst.file == HW_REG)
178 dst.fixed_hw_reg.type = dst.type;
179 }
180
181 resolve_ud_negate(&src0);
182 resolve_ud_negate(&src1);
183
184 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
185 inst->conditional_mod = condition;
186
187 return inst;
188 }
189
190 vec4_instruction *
191 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
196 dst, index);
197 inst->base_mrf = 14;
198 inst->mlen = 2;
199
200 return inst;
201 }
202
203 vec4_instruction *
204 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
205 {
206 vec4_instruction *inst;
207
208 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
209 dst, src, index);
210 inst->base_mrf = 13;
211 inst->mlen = 3;
212
213 return inst;
214 }
215
216 void
217 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
218 {
219 static enum opcode dot_opcodes[] = {
220 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
221 };
222
223 emit(dot_opcodes[elements - 2], dst, src0, src1);
224 }
225
226 src_reg
227 vec4_visitor::fix_math_operand(src_reg src)
228 {
229 /* The gen6 math instruction ignores the source modifiers --
230 * swizzle, abs, negate, and at least some parts of the register
231 * region description.
232 *
233 * Rather than trying to enumerate all these cases, *always* expand the
234 * operand to a temp GRF for gen6.
235 *
236 * For gen7, keep the operand as-is, except if immediate, which gen7 still
237 * can't use.
238 */
239
240 if (intel->gen == 7 && src.file != IMM)
241 return src;
242
243 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
244 expanded.type = src.type;
245 emit(MOV(expanded, src));
246 return src_reg(expanded);
247 }
248
249 void
250 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
251 {
252 src = fix_math_operand(src);
253
254 if (dst.writemask != WRITEMASK_XYZW) {
255 /* The gen6 math instruction must be align1, so we can't do
256 * writemasks.
257 */
258 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
259
260 emit(opcode, temp_dst, src);
261
262 emit(MOV(dst, src_reg(temp_dst)));
263 } else {
264 emit(opcode, dst, src);
265 }
266 }
267
268 void
269 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
270 {
271 vec4_instruction *inst = emit(opcode, dst, src);
272 inst->base_mrf = 1;
273 inst->mlen = 1;
274 }
275
276 void
277 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
278 {
279 switch (opcode) {
280 case SHADER_OPCODE_RCP:
281 case SHADER_OPCODE_RSQ:
282 case SHADER_OPCODE_SQRT:
283 case SHADER_OPCODE_EXP2:
284 case SHADER_OPCODE_LOG2:
285 case SHADER_OPCODE_SIN:
286 case SHADER_OPCODE_COS:
287 break;
288 default:
289 assert(!"not reached: bad math opcode");
290 return;
291 }
292
293 if (intel->gen >= 6) {
294 return emit_math1_gen6(opcode, dst, src);
295 } else {
296 return emit_math1_gen4(opcode, dst, src);
297 }
298 }
299
300 void
301 vec4_visitor::emit_math2_gen6(enum opcode opcode,
302 dst_reg dst, src_reg src0, src_reg src1)
303 {
304 src0 = fix_math_operand(src0);
305 src1 = fix_math_operand(src1);
306
307 if (dst.writemask != WRITEMASK_XYZW) {
308 /* The gen6 math instruction must be align1, so we can't do
309 * writemasks.
310 */
311 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
312 temp_dst.type = dst.type;
313
314 emit(opcode, temp_dst, src0, src1);
315
316 emit(MOV(dst, src_reg(temp_dst)));
317 } else {
318 emit(opcode, dst, src0, src1);
319 }
320 }
321
322 void
323 vec4_visitor::emit_math2_gen4(enum opcode opcode,
324 dst_reg dst, src_reg src0, src_reg src1)
325 {
326 vec4_instruction *inst = emit(opcode, dst, src0, src1);
327 inst->base_mrf = 1;
328 inst->mlen = 2;
329 }
330
331 void
332 vec4_visitor::emit_math(enum opcode opcode,
333 dst_reg dst, src_reg src0, src_reg src1)
334 {
335 switch (opcode) {
336 case SHADER_OPCODE_POW:
337 case SHADER_OPCODE_INT_QUOTIENT:
338 case SHADER_OPCODE_INT_REMAINDER:
339 break;
340 default:
341 assert(!"not reached: unsupported binary math opcode");
342 return;
343 }
344
345 if (intel->gen >= 6) {
346 return emit_math2_gen6(opcode, dst, src0, src1);
347 } else {
348 return emit_math2_gen4(opcode, dst, src0, src1);
349 }
350 }
351
352 void
353 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
354 {
355 if (intel->gen < 7)
356 assert(!"ir_unop_pack_half_2x16 should be lowered");
357
358 assert(dst.type == BRW_REGISTER_TYPE_UD);
359 assert(src0.type == BRW_REGISTER_TYPE_F);
360
361 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
362 *
363 * Because this instruction does not have a 16-bit floating-point type,
364 * the destination data type must be Word (W).
365 *
366 * The destination must be DWord-aligned and specify a horizontal stride
367 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
368 * each destination channel and the upper word is not modified.
369 *
370 * The above restriction implies that the f32to16 instruction must use
371 * align1 mode, because only in align1 mode is it possible to specify
372 * horizontal stride. We choose here to defy the hardware docs and emit
373 * align16 instructions.
374 *
375 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
376 * instructions. I was partially successful in that the code passed all
377 * tests. However, the code was dubiously correct and fragile, and the
378 * tests were not harsh enough to probe that frailty. Not trusting the
379 * code, I chose instead to remain in align16 mode in defiance of the hw
380 * docs).
381 *
382 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
383 * simulator, emitting a f32to16 in align16 mode with UD as destination
384 * data type is safe. The behavior differs from that specified in the PRM
385 * in that the upper word of each destination channel is cleared to 0.
386 */
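   /* Illustrative example (not from the PRM): packHalf2x16(vec2(0.5, 1.0))
    * yields 0x3C003800, since half(0.5) = 0x3800 lands in the low word and
    * half(1.0) = 0x3C00 in the high word; the F32TO16/SHL/OR sequence below
    * assembles exactly that layout.
    */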
387
388 dst_reg tmp_dst(this, glsl_type::uvec2_type);
389 src_reg tmp_src(tmp_dst);
390
391 #if 0
392 /* Verify the undocumented behavior on which the following instructions
393 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
394 * then the result of the bit-or instruction below will be incorrect.
395 *
396 * You should inspect the disasm output in order to verify that the MOV is
397 * not optimized away.
398 */
399 emit(MOV(tmp_dst, src_reg(0x12345678u)));
400 #endif
401
402 /* Give tmp the form below, where "." means untouched.
403 *
404 * w z y x w z y x
405 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
406 *
407 * That the upper word of each write-channel be 0 is required for the
408 * following bit-shift and bit-or instructions to work. Note that this
409 * relies on the undocumented hardware behavior mentioned above.
410 */
411 tmp_dst.writemask = WRITEMASK_XY;
412 emit(F32TO16(tmp_dst, src0));
413
414 /* Give the write-channels of dst the form:
415 * 0xhhhh0000
416 */
417 tmp_src.swizzle = SWIZZLE_Y;
418 emit(SHL(dst, tmp_src, src_reg(16u)));
419
420 /* Finally, give the write-channels of dst the form of packHalf2x16's
421 * output:
422 * 0xhhhhllll
423 */
424 tmp_src.swizzle = SWIZZLE_X;
425 emit(OR(dst, src_reg(dst), tmp_src));
426 }
427
428 void
429 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
430 {
431 if (intel->gen < 7)
432 assert(!"ir_unop_unpack_half_2x16 should be lowered");
433
434 assert(dst.type == BRW_REGISTER_TYPE_F);
435 assert(src0.type == BRW_REGISTER_TYPE_UD);
436
437 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
438 *
439 * Because this instruction does not have a 16-bit floating-point type,
440 * the source data type must be Word (W). The destination type must be
441 * F (Float).
442 *
443 * To use W as the source data type, we must adjust horizontal strides,
444 * which is only possible in align1 mode. All my [chadv] attempts at
445 * emitting align1 instructions for unpackHalf2x16 failed to pass the
446 * Piglit tests, so I gave up.
447 *
448 * I've verified that, on gen7 hardware and the simulator, it is safe to
449 * emit f16to32 in align16 mode with UD as source data type.
450 */
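   /* Illustrative example: unpackHalf2x16(0x3C003800u) yields vec2(0.5, 1.0);
    * the AND below extracts the low word (0x3800 -> 0.5) into X, the SHR
    * extracts the high word (0x3C00 -> 1.0) into Y, and F16TO32 converts both.
    */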
451
452 dst_reg tmp_dst(this, glsl_type::uvec2_type);
453 src_reg tmp_src(tmp_dst);
454
455 tmp_dst.writemask = WRITEMASK_X;
456 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
457
458 tmp_dst.writemask = WRITEMASK_Y;
459 emit(SHR(tmp_dst, src0, src_reg(16u)));
460
461 dst.writemask = WRITEMASK_XY;
462 emit(F16TO32(dst, tmp_src));
463 }
464
465 void
466 vec4_visitor::visit_instructions(const exec_list *list)
467 {
468 foreach_list(node, list) {
469 ir_instruction *ir = (ir_instruction *)node;
470
471 base_ir = ir;
472 ir->accept(this);
473 }
474 }
475
476
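/* Returns the number of vec4 slots occupied by a GLSL type in register or
 * uniform storage.  For example, a mat3 takes 3 slots (one per column),
 * float[4] takes 4 (one per element), and a struct holding a vec3 and a mat4
 * takes 1 + 4 = 5.
 */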
477 static int
478 type_size(const struct glsl_type *type)
479 {
480 unsigned int i;
481 int size;
482
483 switch (type->base_type) {
484 case GLSL_TYPE_UINT:
485 case GLSL_TYPE_INT:
486 case GLSL_TYPE_FLOAT:
487 case GLSL_TYPE_BOOL:
488 if (type->is_matrix()) {
489 return type->matrix_columns;
490 } else {
491 /* Regardless of size of vector, it gets a vec4. This is bad
492 * packing for things like floats, but otherwise arrays become a
493 * mess. Hopefully a later pass over the code can pack scalars
494 * down if appropriate.
495 */
496 return 1;
497 }
498 case GLSL_TYPE_ARRAY:
499 assert(type->length > 0);
500 return type_size(type->fields.array) * type->length;
501 case GLSL_TYPE_STRUCT:
502 size = 0;
503 for (i = 0; i < type->length; i++) {
504 size += type_size(type->fields.structure[i].type);
505 }
506 return size;
507 case GLSL_TYPE_SAMPLER:
508 /* Samplers take up one slot in UNIFORMS[], but they're baked in
509 * at link time.
510 */
511 return 1;
512 case GLSL_TYPE_VOID:
513 case GLSL_TYPE_ERROR:
514 case GLSL_TYPE_INTERFACE:
515 assert(0);
516 break;
517 }
518
519 return 0;
520 }
521
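/* Allocates a virtual GRF of the given size in vec4 registers, growing the
 * size and register-map bookkeeping arrays as needed, and returns the index
 * of the newly allocated virtual GRF.
 */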
522 int
523 vec4_visitor::virtual_grf_alloc(int size)
524 {
525 if (virtual_grf_array_size <= virtual_grf_count) {
526 if (virtual_grf_array_size == 0)
527 virtual_grf_array_size = 16;
528 else
529 virtual_grf_array_size *= 2;
530 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
531 virtual_grf_array_size);
532 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
533 virtual_grf_array_size);
534 }
535 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
536 virtual_grf_reg_count += size;
537 virtual_grf_sizes[virtual_grf_count] = size;
538 return virtual_grf_count++;
539 }
540
541 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
542 {
543 init();
544
545 this->file = GRF;
546 this->reg = v->virtual_grf_alloc(type_size(type));
547
548 if (type->is_array() || type->is_record()) {
549 this->swizzle = BRW_SWIZZLE_NOOP;
550 } else {
551 this->swizzle = swizzle_for_size(type->vector_elements);
552 }
553
554 this->type = brw_type_for_base_type(type);
555 }
556
557 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
558 {
559 init();
560
561 this->file = GRF;
562 this->reg = v->virtual_grf_alloc(type_size(type));
563
564 if (type->is_array() || type->is_record()) {
565 this->writemask = WRITEMASK_XYZW;
566 } else {
567 this->writemask = (1 << type->vector_elements) - 1;
568 }
569
570 this->type = brw_type_for_base_type(type);
571 }
572
573 /* Our support for uniforms is piggy-backed on the struct
574 * gl_fragment_program, because that's where the values actually
575 * get stored, rather than in some global gl_shader_program uniform
576 * store.
577 */
578 void
579 vec4_visitor::setup_uniform_values(ir_variable *ir)
580 {
581 int namelen = strlen(ir->name);
582
583 /* The data for our (non-builtin) uniforms is stored in a series of
584 * gl_uniform_driver_storage structs for each subcomponent that
585 * glGetUniformLocation() could name. We know it's been set up in the same
586 * order we'd walk the type, so walk the list of storage and find anything
587 * with our name, or the prefix of a component that starts with our name.
588 */
589 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
590 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
591
592 if (strncmp(ir->name, storage->name, namelen) != 0 ||
593 (storage->name[namelen] != 0 &&
594 storage->name[namelen] != '.' &&
595 storage->name[namelen] != '[')) {
596 continue;
597 }
598
599 gl_constant_value *components = storage->storage;
600 unsigned vector_count = (MAX2(storage->array_elements, 1) *
601 storage->type->matrix_columns);
602
603 for (unsigned s = 0; s < vector_count; s++) {
604 uniform_vector_size[uniforms] = storage->type->vector_elements;
605
606 int i;
607 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
608 prog_data->param[uniforms * 4 + i] = &components->f;
609 components++;
610 }
611 for (; i < 4; i++) {
612 static float zero = 0;
613 prog_data->param[uniforms * 4 + i] = &zero;
614 }
615
616 uniforms++;
617 }
618 }
619 }
620
621 void
622 vec4_visitor::setup_uniform_clipplane_values()
623 {
624 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
625
626 if (intel->gen < 6) {
627 /* Pre-Gen6, we compact clip planes. For example, if the user
628 * enables just clip planes 0, 1, and 3, we will enable clip planes
629 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
630 * plane 2. This simplifies the implementation of the Gen6 clip
631 * thread.
632 */
633 int compacted_clipplane_index = 0;
634 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
635 if (!(c->key.userclip_planes_enabled_gen_4_5 & (1 << i)))
636 continue;
637
638 this->uniform_vector_size[this->uniforms] = 4;
639 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
640 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
641 for (int j = 0; j < 4; ++j) {
642 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
643 }
644 ++compacted_clipplane_index;
645 ++this->uniforms;
646 }
647 } else {
648 /* In Gen6 and later, we don't compact clip planes, because this
649 * simplifies the implementation of gl_ClipDistance.
650 */
651 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
652 this->uniform_vector_size[this->uniforms] = 4;
653 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
654 this->userplane[i].type = BRW_REGISTER_TYPE_F;
655 for (int j = 0; j < 4; ++j) {
656 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
657 }
658 ++this->uniforms;
659 }
660 }
661 }
662
663 /* Our support for builtin uniforms is even scarier than non-builtin.
664 * It sits on top of the PROG_STATE_VAR parameters that are
665 * automatically updated from GL context state.
666 */
667 void
668 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
669 {
670 const ir_state_slot *const slots = ir->state_slots;
671 assert(ir->state_slots != NULL);
672
673 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
674 /* This state reference has already been setup by ir_to_mesa,
675 * but we'll get the same index back here. We can reference
676 * ParameterValues directly, since unlike brw_fs.cpp, we never
677 * add new state references during compile.
678 */
679 int index = _mesa_add_state_reference(this->prog->Parameters,
680 (gl_state_index *)slots[i].tokens);
681 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
682
683 this->uniform_vector_size[this->uniforms] = 0;
684 /* Add each of the unique swizzled channels of the element.
685 * This will end up matching the size of the glsl_type of this field.
686 */
687 int last_swiz = -1;
688 for (unsigned int j = 0; j < 4; j++) {
689 int swiz = GET_SWZ(slots[i].swizzle, j);
690 last_swiz = swiz;
691
692 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
693 if (swiz <= last_swiz)
694 this->uniform_vector_size[this->uniforms]++;
695 }
696 this->uniforms++;
697 }
698 }
699
700 dst_reg *
701 vec4_visitor::variable_storage(ir_variable *var)
702 {
703 return (dst_reg *)hash_table_find(this->variable_ht, var);
704 }
705
706 void
707 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
708 {
709 ir_expression *expr = ir->as_expression();
710
711 *predicate = BRW_PREDICATE_NORMAL;
712
713 if (expr) {
714 src_reg op[2];
715 vec4_instruction *inst;
716
717 assert(expr->get_num_operands() <= 2);
718 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
719 expr->operands[i]->accept(this);
720 op[i] = this->result;
721
722 resolve_ud_negate(&op[i]);
723 }
724
725 switch (expr->operation) {
726 case ir_unop_logic_not:
727 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
728 inst->conditional_mod = BRW_CONDITIONAL_Z;
729 break;
730
731 case ir_binop_logic_xor:
732 inst = emit(XOR(dst_null_d(), op[0], op[1]));
733 inst->conditional_mod = BRW_CONDITIONAL_NZ;
734 break;
735
736 case ir_binop_logic_or:
737 inst = emit(OR(dst_null_d(), op[0], op[1]));
738 inst->conditional_mod = BRW_CONDITIONAL_NZ;
739 break;
740
741 case ir_binop_logic_and:
742 inst = emit(AND(dst_null_d(), op[0], op[1]));
743 inst->conditional_mod = BRW_CONDITIONAL_NZ;
744 break;
745
746 case ir_unop_f2b:
747 if (intel->gen >= 6) {
748 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
749 } else {
750 inst = emit(MOV(dst_null_f(), op[0]));
751 inst->conditional_mod = BRW_CONDITIONAL_NZ;
752 }
753 break;
754
755 case ir_unop_i2b:
756 if (intel->gen >= 6) {
757 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
758 } else {
759 inst = emit(MOV(dst_null_d(), op[0]));
760 inst->conditional_mod = BRW_CONDITIONAL_NZ;
761 }
762 break;
763
764 case ir_binop_all_equal:
765 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
766 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
767 break;
768
769 case ir_binop_any_nequal:
770 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
771 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
772 break;
773
774 case ir_unop_any:
775 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
776 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
777 break;
778
779 case ir_binop_greater:
780 case ir_binop_gequal:
781 case ir_binop_less:
782 case ir_binop_lequal:
783 case ir_binop_equal:
784 case ir_binop_nequal:
785 emit(CMP(dst_null_d(), op[0], op[1],
786 brw_conditional_for_comparison(expr->operation)));
787 break;
788
789 default:
790 assert(!"not reached");
791 break;
792 }
793 return;
794 }
795
796 ir->accept(this);
797
798 resolve_ud_negate(&this->result);
799
800 if (intel->gen >= 6) {
801 vec4_instruction *inst = emit(AND(dst_null_d(),
802 this->result, src_reg(1)));
803 inst->conditional_mod = BRW_CONDITIONAL_NZ;
804 } else {
805 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
806 inst->conditional_mod = BRW_CONDITIONAL_NZ;
807 }
808 }
809
810 /**
811 * Emit a gen6 IF statement with the comparison folded into the IF
812 * instruction.
813 */
814 void
815 vec4_visitor::emit_if_gen6(ir_if *ir)
816 {
817 ir_expression *expr = ir->condition->as_expression();
818
819 if (expr) {
820 src_reg op[2];
821 dst_reg temp;
822
823 assert(expr->get_num_operands() <= 2);
824 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
825 expr->operands[i]->accept(this);
826 op[i] = this->result;
827 }
828
829 switch (expr->operation) {
830 case ir_unop_logic_not:
831 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
832 return;
833
834 case ir_binop_logic_xor:
835 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
836 return;
837
838 case ir_binop_logic_or:
839 temp = dst_reg(this, glsl_type::bool_type);
840 emit(OR(temp, op[0], op[1]));
841 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
842 return;
843
844 case ir_binop_logic_and:
845 temp = dst_reg(this, glsl_type::bool_type);
846 emit(AND(temp, op[0], op[1]));
847 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
848 return;
849
850 case ir_unop_f2b:
851 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
852 return;
853
854 case ir_unop_i2b:
855 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
856 return;
857
858 case ir_binop_greater:
859 case ir_binop_gequal:
860 case ir_binop_less:
861 case ir_binop_lequal:
862 case ir_binop_equal:
863 case ir_binop_nequal:
864 emit(IF(op[0], op[1],
865 brw_conditional_for_comparison(expr->operation)));
866 return;
867
868 case ir_binop_all_equal:
869 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
870 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
871 return;
872
873 case ir_binop_any_nequal:
874 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
875 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
876 return;
877
878 case ir_unop_any:
879 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
880 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
881 return;
882
883 default:
884 assert(!"not reached");
885 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
886 return;
887 }
888 return;
889 }
890
891 ir->condition->accept(this);
892
893 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
894 }
895
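/* Returns a copy of the given dst_reg with its writemask replaced; the
 * original register is left untouched.
 */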
896 static dst_reg
897 with_writemask(dst_reg const & r, int mask)
898 {
899 dst_reg result = r;
900 result.writemask = mask;
901 return result;
902 }
903
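/* Emits workaround code at the top of the shader for attributes that need it:
 * GL_FIXED rescaling, 2_10_10_10 sign recovery, BGRA channel swizzling,
 * normalization, and integer-to-float conversion, as selected by the
 * per-attribute workaround flags in the program key.
 */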
904 void
905 vec4_visitor::emit_attribute_fixups()
906 {
907 dst_reg sign_recovery_shift;
908 dst_reg normalize_factor;
909 dst_reg es3_normalize_factor;
910
911 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
912 if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
913 uint8_t wa_flags = c->key.gl_attrib_wa_flags[i];
914 dst_reg reg(ATTR, i);
915 dst_reg reg_d = reg;
916 reg_d.type = BRW_REGISTER_TYPE_D;
917 dst_reg reg_ud = reg;
918 reg_ud.type = BRW_REGISTER_TYPE_UD;
919
920 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
921 * come in as floating point conversions of the integer values.
922 */
923 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
924 dst_reg dst = reg;
925 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
926 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
927 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
928 }
929
930 /* Do sign recovery for 2101010 formats if required. */
931 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
932 if (sign_recovery_shift.file == BAD_FILE) {
933 /* shift constant: <22,22,22,30> */
934 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
935 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
936 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
937 }
938
939 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
940 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
941 }
942
943 /* Apply BGRA swizzle if required. */
944 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
945 src_reg temp = src_reg(reg);
946 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
947 emit(MOV(reg, temp));
948 }
949
950 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
951 /* ES 3.0 has different rules for converting signed normalized
952 * fixed-point numbers than desktop GL.
953 */
954 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
955 /* According to equation 2.2 of the ES 3.0 specification,
956 * signed normalization conversion is done by:
957 *
958 * f = c / (2^(b-1)-1)
959 */
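               /* For the 2_10_10_10 formats handled here, b is 10 for the XYZ
                * channels and 2 for W, so the divisors below are 2^9 - 1 = 511
                * and 2^1 - 1 = 1; e.g. c = -511 in a 10-bit channel maps to
                * -1.0, and the clamp afterwards keeps c = -512 from dipping
                * below -1.0.
                */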
960 if (es3_normalize_factor.file == BAD_FILE) {
961 /* mul constant: 1 / (2^(b-1) - 1) */
962 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
963 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
964 src_reg(1.0f / ((1<<9) - 1))));
965 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
966 src_reg(1.0f / ((1<<1) - 1))));
967 }
968
969 dst_reg dst = reg;
970 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
971 emit(MOV(dst, src_reg(reg_d)));
972 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
973 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
974 } else {
975 /* The following equations are from the OpenGL 3.2 specification:
976 *
977 * 2.1 unsigned normalization
978 * f = c/(2^n-1)
979 *
980 * 2.2 signed normalization
981 * f = (2c+1)/(2^n-1)
982 *
983 * Both of these share a common divisor, which is represented by
984 * "normalize_factor" in the code below.
985 */
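            /* Here n is 10 for the XYZ channels and 2 for W, so the shared
             * divisor below is 1/1023 for XYZ and 1/3 for W; e.g. an unsigned
             * 10-bit c = 1023 normalizes to exactly 1.0.
             */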
986 if (normalize_factor.file == BAD_FILE) {
987 /* 1 / (2^b - 1) for b=<10,10,10,2> */
988 normalize_factor = dst_reg(this, glsl_type::vec4_type);
989 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
990 src_reg(1.0f / ((1<<10) - 1))));
991 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
992 src_reg(1.0f / ((1<<2) - 1))));
993 }
994
995 dst_reg dst = reg;
996 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
997 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
998
999 /* For signed normalization, we want the numerator to be 2c+1. */
1000 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
1001 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
1002 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
1003 }
1004
1005 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1006 }
1007 }
1008
1009 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1010 dst_reg dst = reg;
1011 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1012 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1013 }
1014 }
1015 }
1016 }
1017
1018 void
1019 vec4_visitor::visit(ir_variable *ir)
1020 {
1021 dst_reg *reg = NULL;
1022
1023 if (variable_storage(ir))
1024 return;
1025
1026 switch (ir->mode) {
1027 case ir_var_shader_in:
1028 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1029 break;
1030
1031 case ir_var_shader_out:
1032 reg = new(mem_ctx) dst_reg(this, ir->type);
1033
1034 for (int i = 0; i < type_size(ir->type); i++) {
1035 output_reg[ir->location + i] = *reg;
1036 output_reg[ir->location + i].reg_offset = i;
1037 output_reg[ir->location + i].type =
1038 brw_type_for_base_type(ir->type->get_scalar_type());
1039 output_reg_annotation[ir->location + i] = ir->name;
1040 }
1041 break;
1042
1043 case ir_var_auto:
1044 case ir_var_temporary:
1045 reg = new(mem_ctx) dst_reg(this, ir->type);
1046 break;
1047
1048 case ir_var_uniform:
1049 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1050
1051 /* Thanks to the lower_ubo_reference pass, we will see only
1052 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1053 * variables, so no need for them to be in variable_ht.
1054 */
1055 if (ir->is_in_uniform_block())
1056 return;
1057
1058 /* Track how big the whole uniform variable is, in case we need to put a
1059 * copy of its data into pull constants for array access.
1060 */
1061 this->uniform_size[this->uniforms] = type_size(ir->type);
1062
1063 if (!strncmp(ir->name, "gl_", 3)) {
1064 setup_builtin_uniform_values(ir);
1065 } else {
1066 setup_uniform_values(ir);
1067 }
1068 break;
1069
1070 case ir_var_system_value:
1071 /* VertexID is stored by the VF as the last vertex element, but
1072 * we don't represent it with a flag in inputs_read, so we call
1073 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1074 */
1075 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1076 prog_data->uses_vertexid = true;
1077
1078 switch (ir->location) {
1079 case SYSTEM_VALUE_VERTEX_ID:
1080 reg->writemask = WRITEMASK_X;
1081 break;
1082 case SYSTEM_VALUE_INSTANCE_ID:
1083 reg->writemask = WRITEMASK_Y;
1084 break;
1085 default:
1086 assert(!"not reached");
1087 break;
1088 }
1089 break;
1090
1091 default:
1092 assert(!"not reached");
1093 }
1094
1095 reg->type = brw_type_for_base_type(ir->type);
1096 hash_table_insert(this->variable_ht, reg, ir);
1097 }
1098
1099 void
1100 vec4_visitor::visit(ir_loop *ir)
1101 {
1102 dst_reg counter;
1103
1104 /* We don't want debugging output to print the whole body of the
1105 * loop as the annotation.
1106 */
1107 this->base_ir = NULL;
1108
1109 if (ir->counter != NULL) {
1110 this->base_ir = ir->counter;
1111 ir->counter->accept(this);
1112 counter = *(variable_storage(ir->counter));
1113
1114 if (ir->from != NULL) {
1115 this->base_ir = ir->from;
1116 ir->from->accept(this);
1117
1118 emit(MOV(counter, this->result));
1119 }
1120 }
1121
1122 emit(BRW_OPCODE_DO);
1123
1124 if (ir->to) {
1125 this->base_ir = ir->to;
1126 ir->to->accept(this);
1127
1128 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1129 brw_conditional_for_comparison(ir->cmp)));
1130
1131 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1132 inst->predicate = BRW_PREDICATE_NORMAL;
1133 }
1134
1135 visit_instructions(&ir->body_instructions);
1136
1137
1138 if (ir->increment) {
1139 this->base_ir = ir->increment;
1140 ir->increment->accept(this);
1141 emit(ADD(counter, src_reg(counter), this->result));
1142 }
1143
1144 emit(BRW_OPCODE_WHILE);
1145 }
1146
1147 void
1148 vec4_visitor::visit(ir_loop_jump *ir)
1149 {
1150 switch (ir->mode) {
1151 case ir_loop_jump::jump_break:
1152 emit(BRW_OPCODE_BREAK);
1153 break;
1154 case ir_loop_jump::jump_continue:
1155 emit(BRW_OPCODE_CONTINUE);
1156 break;
1157 }
1158 }
1159
1160
1161 void
1162 vec4_visitor::visit(ir_function_signature *ir)
1163 {
1164 assert(0);
1165 (void)ir;
1166 }
1167
1168 void
1169 vec4_visitor::visit(ir_function *ir)
1170 {
1171 /* Ignore function bodies other than main() -- we shouldn't see calls to
1172 * them since they should all be inlined.
1173 */
1174 if (strcmp(ir->name, "main") == 0) {
1175 const ir_function_signature *sig;
1176 exec_list empty;
1177
1178 sig = ir->matching_signature(&empty);
1179
1180 assert(sig);
1181
1182 visit_instructions(&sig->body);
1183 }
1184 }
1185
1186 bool
1187 vec4_visitor::try_emit_sat(ir_expression *ir)
1188 {
1189 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1190 if (!sat_src)
1191 return false;
1192
1193 sat_src->accept(this);
1194 src_reg src = this->result;
1195
1196 this->result = src_reg(this, ir->type);
1197 vec4_instruction *inst;
1198 inst = emit(MOV(dst_reg(this->result), src));
1199 inst->saturate = true;
1200
1201 return true;
1202 }
1203
1204 void
1205 vec4_visitor::emit_bool_comparison(unsigned int op,
1206 dst_reg dst, src_reg src0, src_reg src1)
1207 {
1208 /* original gen4 does destination conversion before comparison. */
1209 if (intel->gen < 5)
1210 dst.type = src0.type;
1211
1212 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1213
1214 dst.type = BRW_REGISTER_TYPE_D;
1215 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1216 }
1217
1218 void
1219 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1220 src_reg src0, src_reg src1)
1221 {
1222 vec4_instruction *inst;
1223
1224 if (intel->gen >= 6) {
1225 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1226 inst->conditional_mod = conditionalmod;
1227 } else {
1228 emit(CMP(dst, src0, src1, conditionalmod));
1229
1230 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1231 inst->predicate = BRW_PREDICATE_NORMAL;
1232 }
1233 }
1234
1235 void
1236 vec4_visitor::visit(ir_expression *ir)
1237 {
1238 unsigned int operand;
1239 src_reg op[Elements(ir->operands)];
1240 src_reg result_src;
1241 dst_reg result_dst;
1242 vec4_instruction *inst;
1243
1244 if (try_emit_sat(ir))
1245 return;
1246
1247 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1248 this->result.file = BAD_FILE;
1249 ir->operands[operand]->accept(this);
1250 if (this->result.file == BAD_FILE) {
1251 printf("Failed to get tree for expression operand:\n");
1252 ir->operands[operand]->print();
1253 exit(1);
1254 }
1255 op[operand] = this->result;
1256
1257 /* Matrix expression operands should have been broken down to vector
1258 * operations already.
1259 */
1260 assert(!ir->operands[operand]->type->is_matrix());
1261 }
1262
1263 int vector_elements = ir->operands[0]->type->vector_elements;
1264 if (ir->operands[1]) {
1265 vector_elements = MAX2(vector_elements,
1266 ir->operands[1]->type->vector_elements);
1267 }
1268
1269 this->result.file = BAD_FILE;
1270
1271 /* Storage for our result. Ideally for an assignment we'd be using
1272 * the actual storage for the result here, instead.
1273 */
1274 result_src = src_reg(this, ir->type);
1275 /* convenience for the emit functions below. */
1276 result_dst = dst_reg(result_src);
1277 /* If nothing special happens, this is the result. */
1278 this->result = result_src;
1279 /* Limit writes to the channels that will be used by result_src later.
1280 * This does limit this temp's use as a temporary for multi-instruction
1281 * sequences.
1282 */
1283 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1284
1285 switch (ir->operation) {
1286 case ir_unop_logic_not:
1287      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is the
1288       * one's complement of the whole register, not just bit 0.
1289 */
1290 emit(XOR(result_dst, op[0], src_reg(1)));
1291 break;
1292 case ir_unop_neg:
1293 op[0].negate = !op[0].negate;
1294 this->result = op[0];
1295 break;
1296 case ir_unop_abs:
1297 op[0].abs = true;
1298 op[0].negate = false;
1299 this->result = op[0];
1300 break;
1301
1302 case ir_unop_sign:
1303 emit(MOV(result_dst, src_reg(0.0f)));
1304
1305 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1306 inst = emit(MOV(result_dst, src_reg(1.0f)));
1307 inst->predicate = BRW_PREDICATE_NORMAL;
1308
1309 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1310 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1311 inst->predicate = BRW_PREDICATE_NORMAL;
1312
1313 break;
1314
1315 case ir_unop_rcp:
1316 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1317 break;
1318
1319 case ir_unop_exp2:
1320 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1321 break;
1322 case ir_unop_log2:
1323 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1324 break;
1325 case ir_unop_exp:
1326 case ir_unop_log:
1327 assert(!"not reached: should be handled by ir_explog_to_explog2");
1328 break;
1329 case ir_unop_sin:
1330 case ir_unop_sin_reduced:
1331 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1332 break;
1333 case ir_unop_cos:
1334 case ir_unop_cos_reduced:
1335 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1336 break;
1337
1338 case ir_unop_dFdx:
1339 case ir_unop_dFdy:
1340 assert(!"derivatives not valid in vertex shader");
1341 break;
1342
1343 case ir_unop_noise:
1344 assert(!"not reached: should be handled by lower_noise");
1345 break;
1346
1347 case ir_binop_add:
1348 emit(ADD(result_dst, op[0], op[1]));
1349 break;
1350 case ir_binop_sub:
1351 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1352 break;
1353
1354 case ir_binop_mul:
1355 if (ir->type->is_integer()) {
1356 /* For integer multiplication, the MUL uses the low 16 bits
1357 * of one of the operands (src0 on gen6, src1 on gen7). The
1358 * MACH accumulates in the contribution of the upper 16 bits
1359 * of that operand.
1360 *
1361 * FINISHME: Emit just the MUL if we know an operand is small
1362 * enough.
1363 */
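         /* The sequence below therefore is: MUL writes the partial product
          * into the accumulator, MACH (whose own destination is discarded to
          * null) folds in the upper-16-bit contribution, and the final MOV
          * copies the accumulated low 32 bits of the product into the result.
          */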
1364 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1365
1366 emit(MUL(acc, op[0], op[1]));
1367 emit(MACH(dst_null_d(), op[0], op[1]));
1368 emit(MOV(result_dst, src_reg(acc)));
1369 } else {
1370 emit(MUL(result_dst, op[0], op[1]));
1371 }
1372 break;
1373 case ir_binop_div:
1374 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1375 assert(ir->type->is_integer());
1376 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1377 break;
1378 case ir_binop_mod:
1379 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1380 assert(ir->type->is_integer());
1381 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1382 break;
1383
1384 case ir_binop_less:
1385 case ir_binop_greater:
1386 case ir_binop_lequal:
1387 case ir_binop_gequal:
1388 case ir_binop_equal:
1389 case ir_binop_nequal: {
1390 emit(CMP(result_dst, op[0], op[1],
1391 brw_conditional_for_comparison(ir->operation)));
1392 emit(AND(result_dst, result_src, src_reg(0x1)));
1393 break;
1394 }
1395
1396 case ir_binop_all_equal:
1397 /* "==" operator producing a scalar boolean. */
1398 if (ir->operands[0]->type->is_vector() ||
1399 ir->operands[1]->type->is_vector()) {
1400 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1401 emit(MOV(result_dst, src_reg(0)));
1402 inst = emit(MOV(result_dst, src_reg(1)));
1403 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1404 } else {
1405 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1406 emit(AND(result_dst, result_src, src_reg(0x1)));
1407 }
1408 break;
1409 case ir_binop_any_nequal:
1410 /* "!=" operator producing a scalar boolean. */
1411 if (ir->operands[0]->type->is_vector() ||
1412 ir->operands[1]->type->is_vector()) {
1413 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1414
1415 emit(MOV(result_dst, src_reg(0)));
1416 inst = emit(MOV(result_dst, src_reg(1)));
1417 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1418 } else {
1419 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1420 emit(AND(result_dst, result_src, src_reg(0x1)));
1421 }
1422 break;
1423
1424 case ir_unop_any:
1425 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1426 emit(MOV(result_dst, src_reg(0)));
1427
1428 inst = emit(MOV(result_dst, src_reg(1)));
1429 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1430 break;
1431
1432 case ir_binop_logic_xor:
1433 emit(XOR(result_dst, op[0], op[1]));
1434 break;
1435
1436 case ir_binop_logic_or:
1437 emit(OR(result_dst, op[0], op[1]));
1438 break;
1439
1440 case ir_binop_logic_and:
1441 emit(AND(result_dst, op[0], op[1]));
1442 break;
1443
1444 case ir_binop_dot:
1445 assert(ir->operands[0]->type->is_vector());
1446 assert(ir->operands[0]->type == ir->operands[1]->type);
1447 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1448 break;
1449
1450 case ir_unop_sqrt:
1451 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1452 break;
1453 case ir_unop_rsq:
1454 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1455 break;
1456
1457 case ir_unop_bitcast_i2f:
1458 case ir_unop_bitcast_u2f:
1459 this->result = op[0];
1460 this->result.type = BRW_REGISTER_TYPE_F;
1461 break;
1462
1463 case ir_unop_bitcast_f2i:
1464 this->result = op[0];
1465 this->result.type = BRW_REGISTER_TYPE_D;
1466 break;
1467
1468 case ir_unop_bitcast_f2u:
1469 this->result = op[0];
1470 this->result.type = BRW_REGISTER_TYPE_UD;
1471 break;
1472
1473 case ir_unop_i2f:
1474 case ir_unop_i2u:
1475 case ir_unop_u2i:
1476 case ir_unop_u2f:
1477 case ir_unop_b2f:
1478 case ir_unop_b2i:
1479 case ir_unop_f2i:
1480 case ir_unop_f2u:
1481 emit(MOV(result_dst, op[0]));
1482 break;
1483 case ir_unop_f2b:
1484 case ir_unop_i2b: {
1485 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1486 emit(AND(result_dst, result_src, src_reg(1)));
1487 break;
1488 }
1489
1490 case ir_unop_trunc:
1491 emit(RNDZ(result_dst, op[0]));
1492 break;
1493 case ir_unop_ceil:
1494 op[0].negate = !op[0].negate;
1495 inst = emit(RNDD(result_dst, op[0]));
1496 this->result.negate = true;
1497 break;
1498 case ir_unop_floor:
1499 inst = emit(RNDD(result_dst, op[0]));
1500 break;
1501 case ir_unop_fract:
1502 inst = emit(FRC(result_dst, op[0]));
1503 break;
1504 case ir_unop_round_even:
1505 emit(RNDE(result_dst, op[0]));
1506 break;
1507
1508 case ir_binop_min:
1509 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1510 break;
1511 case ir_binop_max:
1512 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1513 break;
1514
1515 case ir_binop_pow:
1516 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1517 break;
1518
1519 case ir_unop_bit_not:
1520 inst = emit(NOT(result_dst, op[0]));
1521 break;
1522 case ir_binop_bit_and:
1523 inst = emit(AND(result_dst, op[0], op[1]));
1524 break;
1525 case ir_binop_bit_xor:
1526 inst = emit(XOR(result_dst, op[0], op[1]));
1527 break;
1528 case ir_binop_bit_or:
1529 inst = emit(OR(result_dst, op[0], op[1]));
1530 break;
1531
1532 case ir_binop_lshift:
1533 inst = emit(SHL(result_dst, op[0], op[1]));
1534 break;
1535
1536 case ir_binop_rshift:
1537 if (ir->type->base_type == GLSL_TYPE_INT)
1538 inst = emit(ASR(result_dst, op[0], op[1]));
1539 else
1540 inst = emit(SHR(result_dst, op[0], op[1]));
1541 break;
1542
1543 case ir_binop_ubo_load: {
1544 ir_constant *uniform_block = ir->operands[0]->as_constant();
1545 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1546 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1547 src_reg offset = op[1];
1548
1549 /* Now, load the vector from that offset. */
1550 assert(ir->type->is_vector() || ir->type->is_scalar());
1551
1552 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1553 packed_consts.type = result.type;
1554 src_reg surf_index =
1555 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1556 if (const_offset_ir) {
1557 offset = src_reg(const_offset / 16);
1558 } else {
1559 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1560 }
1561
1562 vec4_instruction *pull =
1563 emit(new(mem_ctx) vec4_instruction(this,
1564 VS_OPCODE_PULL_CONSTANT_LOAD,
1565 dst_reg(packed_consts),
1566 surf_index,
1567 offset));
1568 pull->base_mrf = 14;
1569 pull->mlen = 1;
1570
1571 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1572 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1573 const_offset % 16 / 4,
1574 const_offset % 16 / 4,
1575 const_offset % 16 / 4);
1576
1577 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1578 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1579 emit(CMP(result_dst, packed_consts, src_reg(0u),
1580 BRW_CONDITIONAL_NZ));
1581 emit(AND(result_dst, result, src_reg(0x1)));
1582 } else {
1583 emit(MOV(result_dst, packed_consts));
1584 }
1585 break;
1586 }
1587
1588 case ir_triop_lrp:
1589 assert(!"not reached: should be handled by lrp_to_arith");
1590 break;
1591
1592 case ir_quadop_vector:
1593 assert(!"not reached: should be handled by lower_quadop_vector");
1594 break;
1595
1596 case ir_unop_pack_half_2x16:
1597 emit_pack_half_2x16(result_dst, op[0]);
1598 break;
1599 case ir_unop_unpack_half_2x16:
1600 emit_unpack_half_2x16(result_dst, op[0]);
1601 break;
1602 case ir_unop_pack_snorm_2x16:
1603 case ir_unop_pack_snorm_4x8:
1604 case ir_unop_pack_unorm_2x16:
1605 case ir_unop_pack_unorm_4x8:
1606 case ir_unop_unpack_snorm_2x16:
1607 case ir_unop_unpack_snorm_4x8:
1608 case ir_unop_unpack_unorm_2x16:
1609 case ir_unop_unpack_unorm_4x8:
1610 assert(!"not reached: should be handled by lower_packing_builtins");
1611 break;
1612 case ir_unop_unpack_half_2x16_split_x:
1613 case ir_unop_unpack_half_2x16_split_y:
1614 case ir_binop_pack_half_2x16_split:
1615 assert(!"not reached: should not occur in vertex shader");
1616 break;
1617 }
1618 }
1619
1620
1621 void
1622 vec4_visitor::visit(ir_swizzle *ir)
1623 {
1624 src_reg src;
1625 int i = 0;
1626 int swizzle[4];
1627
1628 /* Note that this is only swizzles in expressions, not those on the left
1629 * hand side of an assignment, which do write masking. See ir_assignment
1630 * for that.
1631 */
1632
1633 ir->val->accept(this);
1634 src = this->result;
1635 assert(src.file != BAD_FILE);
1636
1637 for (i = 0; i < ir->type->vector_elements; i++) {
1638 switch (i) {
1639 case 0:
1640 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1641 break;
1642 case 1:
1643 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1644 break;
1645 case 2:
1646 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1647 break;
1648 case 3:
1649 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1650 break;
1651 }
1652 }
1653 for (; i < 4; i++) {
1654 /* Replicate the last channel out. */
1655 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1656 }
1657
1658 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1659
1660 this->result = src;
1661 }
1662
1663 void
1664 vec4_visitor::visit(ir_dereference_variable *ir)
1665 {
1666 const struct glsl_type *type = ir->type;
1667 dst_reg *reg = variable_storage(ir->var);
1668
1669 if (!reg) {
1670 fail("Failed to find variable storage for %s\n", ir->var->name);
1671 this->result = src_reg(brw_null_reg());
1672 return;
1673 }
1674
1675 this->result = src_reg(*reg);
1676
1677 /* System values get their swizzle from the dst_reg writemask */
1678 if (ir->var->mode == ir_var_system_value)
1679 return;
1680
1681 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1682 this->result.swizzle = swizzle_for_size(type->vector_elements);
1683 }
1684
1685 void
1686 vec4_visitor::visit(ir_dereference_array *ir)
1687 {
1688 ir_constant *constant_index;
1689 src_reg src;
1690 int element_size = type_size(ir->type);
1691
1692 constant_index = ir->array_index->constant_expression_value();
1693
1694 ir->array->accept(this);
1695 src = this->result;
1696
1697 if (constant_index) {
1698 src.reg_offset += constant_index->value.i[0] * element_size;
1699 } else {
1700 /* Variable index array dereference. It eats the "vec4" of the
1701 * base of the array and an index that offsets the Mesa register
1702 * index.
1703 */
1704 ir->array_index->accept(this);
1705
1706 src_reg index_reg;
1707
1708 if (element_size == 1) {
1709 index_reg = this->result;
1710 } else {
1711 index_reg = src_reg(this, glsl_type::int_type);
1712
1713 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1714 }
1715
1716 if (src.reladdr) {
1717 src_reg temp = src_reg(this, glsl_type::int_type);
1718
1719 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1720
1721 index_reg = temp;
1722 }
1723
1724 src.reladdr = ralloc(mem_ctx, src_reg);
1725 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1726 }
1727
1728 /* If the type is smaller than a vec4, replicate the last channel out. */
1729 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1730 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1731 else
1732 src.swizzle = BRW_SWIZZLE_NOOP;
1733 src.type = brw_type_for_base_type(ir->type);
1734
1735 this->result = src;
1736 }
1737
1738 void
1739 vec4_visitor::visit(ir_dereference_record *ir)
1740 {
1741 unsigned int i;
1742 const glsl_type *struct_type = ir->record->type;
1743 int offset = 0;
1744
1745 ir->record->accept(this);
1746
1747 for (i = 0; i < struct_type->length; i++) {
1748 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1749 break;
1750 offset += type_size(struct_type->fields.structure[i].type);
1751 }
1752
1753 /* If the type is smaller than a vec4, replicate the last channel out. */
1754 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1755 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1756 else
1757 this->result.swizzle = BRW_SWIZZLE_NOOP;
1758 this->result.type = brw_type_for_base_type(ir->type);
1759
1760 this->result.reg_offset += offset;
1761 }
1762
1763 /**
1764 * We want to be careful in assignment setup to hit the actual storage
1765 * instead of potentially using a temporary like we might with the
1766 * ir_dereference handler.
1767 */
1768 static dst_reg
1769 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1770 {
1771 /* The LHS must be a dereference. If the LHS is a variable indexed array
1772    * access of a vector, it must be separated into a series of conditional moves
1773 * before reaching this point (see ir_vec_index_to_cond_assign).
1774 */
1775 assert(ir->as_dereference());
1776 ir_dereference_array *deref_array = ir->as_dereference_array();
1777 if (deref_array) {
1778 assert(!deref_array->array->type->is_vector());
1779 }
1780
1781 /* Use the rvalue deref handler for the most part. We'll ignore
1782 * swizzles in it and write swizzles using writemask, though.
1783 */
1784 ir->accept(v);
1785 return dst_reg(v->result);
1786 }
1787
1788 void
1789 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1790 const struct glsl_type *type, uint32_t predicate)
1791 {
1792 if (type->base_type == GLSL_TYPE_STRUCT) {
1793 for (unsigned int i = 0; i < type->length; i++) {
1794 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1795 }
1796 return;
1797 }
1798
1799 if (type->is_array()) {
1800 for (unsigned int i = 0; i < type->length; i++) {
1801 emit_block_move(dst, src, type->fields.array, predicate);
1802 }
1803 return;
1804 }
1805
1806 if (type->is_matrix()) {
1807 const struct glsl_type *vec_type;
1808
1809 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1810 type->vector_elements, 1);
1811
1812 for (int i = 0; i < type->matrix_columns; i++) {
1813 emit_block_move(dst, src, vec_type, predicate);
1814 }
1815 return;
1816 }
1817
1818 assert(type->is_scalar() || type->is_vector());
1819
1820 dst->type = brw_type_for_base_type(type);
1821 src->type = dst->type;
1822
1823 dst->writemask = (1 << type->vector_elements) - 1;
1824
1825 src->swizzle = swizzle_for_size(type->vector_elements);
1826
1827 vec4_instruction *inst = emit(MOV(*dst, *src));
1828 inst->predicate = predicate;
1829
1830 dst->reg_offset++;
1831 src->reg_offset++;
1832 }
1833
1834
1835 /* If the RHS processing resulted in an instruction generating a
1836 * temporary value, and it would be easy to rewrite the instruction to
1837 * generate its result right into the LHS instead, do so. This ends
1838 * up reliably removing instructions where it can be tricky to do so
1839 * later without real UD chain information.
1840 */
1841 bool
1842 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1843 dst_reg dst,
1844 src_reg src,
1845 vec4_instruction *pre_rhs_inst,
1846 vec4_instruction *last_rhs_inst)
1847 {
1848 /* This could be supported, but it would take more smarts. */
1849 if (ir->condition)
1850 return false;
1851
1852 if (pre_rhs_inst == last_rhs_inst)
1853 return false; /* No instructions generated to work with. */
1854
1855 /* Make sure the last instruction generated our source reg. */
1856 if (src.file != GRF ||
1857 src.file != last_rhs_inst->dst.file ||
1858 src.reg != last_rhs_inst->dst.reg ||
1859 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1860 src.reladdr ||
1861 src.abs ||
1862 src.negate ||
1863 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1864 return false;
1865
1866    /* Check that the last instruction fully initialized the channels
1867 * we want to use, in the order we want to use them. We could
1868 * potentially reswizzle the operands of many instructions so that
1869 * we could handle out of order channels, but don't yet.
1870 */
1871
1872 for (unsigned i = 0; i < 4; i++) {
1873 if (dst.writemask & (1 << i)) {
1874 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1875 return false;
1876
1877 if (BRW_GET_SWZ(src.swizzle, i) != i)
1878 return false;
1879 }
1880 }
1881
1882 /* Success! Rewrite the instruction. */
1883 last_rhs_inst->dst.file = dst.file;
1884 last_rhs_inst->dst.reg = dst.reg;
1885 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1886 last_rhs_inst->dst.reladdr = dst.reladdr;
1887 last_rhs_inst->dst.writemask &= dst.writemask;
1888
1889 return true;
1890 }
1891
1892 void
1893 vec4_visitor::visit(ir_assignment *ir)
1894 {
1895 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1896 uint32_t predicate = BRW_PREDICATE_NONE;
1897
1898 if (!ir->lhs->type->is_scalar() &&
1899 !ir->lhs->type->is_vector()) {
1900 ir->rhs->accept(this);
1901 src_reg src = this->result;
1902
1903 if (ir->condition) {
1904 emit_bool_to_cond_code(ir->condition, &predicate);
1905 }
1906
1907 /* emit_block_move doesn't account for swizzles in the source register.
1908 * This should be ok, since the source register is a structure or an
1909 * array, and those can't be swizzled. But double-check to be sure.
1910 */
1911 assert(src.swizzle ==
1912 (ir->rhs->type->is_matrix()
1913 ? swizzle_for_size(ir->rhs->type->vector_elements)
1914 : BRW_SWIZZLE_NOOP));
1915
1916 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1917 return;
1918 }
1919
1920 /* Now we're down to just a scalar/vector with writemasks. */
1921 int i;
1922
1923 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1924 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1925
1926 ir->rhs->accept(this);
1927
1928 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1929
1930 src_reg src = this->result;
1931
1932 int swizzles[4];
1933 int first_enabled_chan = 0;
1934 int src_chan = 0;
1935
1936 assert(ir->lhs->type->is_vector() ||
1937 ir->lhs->type->is_scalar());
1938 dst.writemask = ir->write_mask;
1939
1940 for (int i = 0; i < 4; i++) {
1941 if (dst.writemask & (1 << i)) {
1942 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1943 break;
1944 }
1945 }
1946
1947 /* Swizzle a small RHS vector into the channels being written.
1948 *
1949 * GLSL IR treats write_mask as dictating how many channels are
1950 * present on the RHS, while in our instructions we need to make
1951 * those channels appear in the slots of the vec4 they're written to.
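*
* For example, for "v.xz = u.xy" (write_mask XZ) an RHS swizzle of roughly
* .xyyy is remapped to .xxyx: RHS channel 0 lands in slot x, channel 1 in
* slot z, and the unwritten slots just replicate a defined channel.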
1952 */
1953 for (int i = 0; i < 4; i++) {
1954 if (dst.writemask & (1 << i))
1955 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1956 else
1957 swizzles[i] = first_enabled_chan;
1958 }
1959 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1960 swizzles[2], swizzles[3]);
1961
1962 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1963 return;
1964 }
1965
1966 if (ir->condition) {
1967 emit_bool_to_cond_code(ir->condition, &predicate);
1968 }
1969
1970 for (i = 0; i < type_size(ir->lhs->type); i++) {
1971 vec4_instruction *inst = emit(MOV(dst, src));
1972 inst->predicate = predicate;
1973
1974 dst.reg_offset++;
1975 src.reg_offset++;
1976 }
1977 }
1978
1979 void
1980 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1981 {
1982 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1983 foreach_list(node, &ir->components) {
1984 ir_constant *field_value = (ir_constant *)node;
1985
1986 emit_constant_values(dst, field_value);
1987 }
1988 return;
1989 }
1990
1991 if (ir->type->is_array()) {
1992 for (unsigned int i = 0; i < ir->type->length; i++) {
1993 emit_constant_values(dst, ir->array_elements[i]);
1994 }
1995 return;
1996 }
1997
1998 if (ir->type->is_matrix()) {
1999 for (int i = 0; i < ir->type->matrix_columns; i++) {
2000 float *vec = &ir->value.f[i * ir->type->vector_elements];
2001
2002 for (int j = 0; j < ir->type->vector_elements; j++) {
2003 dst->writemask = 1 << j;
2004 dst->type = BRW_REGISTER_TYPE_F;
2005
2006 emit(MOV(*dst, src_reg(vec[j])));
2007 }
2008 dst->reg_offset++;
2009 }
2010 return;
2011 }
2012
2013 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2014
2015 for (int i = 0; i < ir->type->vector_elements; i++) {
2016 if (!(remaining_writemask & (1 << i)))
2017 continue;
2018
2019 dst->writemask = 1 << i;
2020 dst->type = brw_type_for_base_type(ir->type);
2021
2022 /* Find other components that match the one we're about to
2023 * write. Emits fewer instructions for things like vec4(0.5,
2024 * 1.5, 1.5, 1.5).
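* (That example ends up as one MOV with a .x writemask for 0.5 and one
* MOV with a .yzw writemask for 1.5.)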
2025 */
2026 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2027 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2028 if (ir->value.b[i] == ir->value.b[j])
2029 dst->writemask |= (1 << j);
2030 } else {
2031 /* u, i, and f storage all line up, so no need for a
2032 * switch case for comparing each type.
2033 */
2034 if (ir->value.u[i] == ir->value.u[j])
2035 dst->writemask |= (1 << j);
2036 }
2037 }
2038
2039 switch (ir->type->base_type) {
2040 case GLSL_TYPE_FLOAT:
2041 emit(MOV(*dst, src_reg(ir->value.f[i])));
2042 break;
2043 case GLSL_TYPE_INT:
2044 emit(MOV(*dst, src_reg(ir->value.i[i])));
2045 break;
2046 case GLSL_TYPE_UINT:
2047 emit(MOV(*dst, src_reg(ir->value.u[i])));
2048 break;
2049 case GLSL_TYPE_BOOL:
2050 emit(MOV(*dst, src_reg(ir->value.b[i])));
2051 break;
2052 default:
2053 assert(!"Non-float/uint/int/bool constant");
2054 break;
2055 }
2056
2057 remaining_writemask &= ~dst->writemask;
2058 }
2059 dst->reg_offset++;
2060 }
2061
2062 void
2063 vec4_visitor::visit(ir_constant *ir)
2064 {
2065 dst_reg dst = dst_reg(this, ir->type);
2066 this->result = src_reg(dst);
2067
2068 emit_constant_values(&dst, ir);
2069 }
2070
2071 void
2072 vec4_visitor::visit(ir_call *ir)
2073 {
2074 assert(!"not reached");
2075 }
2076
2077 void
2078 vec4_visitor::visit(ir_texture *ir)
2079 {
2080 int sampler =
2081 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2082
2083 /* Should be lowered by do_lower_texture_projection */
2084 assert(!ir->projector);
2085
2086 /* Generate code to compute all the subexpression trees. This has to be
2087 * done before loading any values into MRFs for the sampler message since
2088 * generating these values may involve SEND messages that need the MRFs.
2089 */
2090 src_reg coordinate;
2091 if (ir->coordinate) {
2092 ir->coordinate->accept(this);
2093 coordinate = this->result;
2094 }
2095
2096 src_reg shadow_comparitor;
2097 if (ir->shadow_comparitor) {
2098 ir->shadow_comparitor->accept(this);
2099 shadow_comparitor = this->result;
2100 }
2101
2102 const glsl_type *lod_type, *sample_index_type;
2103 src_reg lod, dPdx, dPdy, sample_index;
2104 switch (ir->op) {
2105 case ir_tex:
2106 lod = src_reg(0.0f);
2107 lod_type = glsl_type::float_type;
2108 break;
2109 case ir_txf:
2110 case ir_txl:
2111 case ir_txs:
2112 ir->lod_info.lod->accept(this);
2113 lod = this->result;
2114 lod_type = ir->lod_info.lod->type;
2115 break;
2116 case ir_txf_ms:
2117 ir->lod_info.sample_index->accept(this);
2118 sample_index = this->result;
2119 sample_index_type = ir->lod_info.sample_index->type;
2120 break;
2121 case ir_txd:
2122 ir->lod_info.grad.dPdx->accept(this);
2123 dPdx = this->result;
2124
2125 ir->lod_info.grad.dPdy->accept(this);
2126 dPdy = this->result;
2127
2128 lod_type = ir->lod_info.grad.dPdx->type;
2129 break;
2130 case ir_txb:
2131 case ir_lod:
2132 break;
2133 }
2134
2135 vec4_instruction *inst = NULL;
2136 switch (ir->op) {
2137 case ir_tex:
2138 case ir_txl:
2139 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2140 break;
2141 case ir_txd:
2142 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2143 break;
2144 case ir_txf:
2145 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2146 break;
2147 case ir_txf_ms:
2148 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2149 break;
2150 case ir_txs:
2151 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2152 break;
2153 case ir_txb:
2154 assert(!"TXB is not valid for vertex shaders.");
2155 break;
2156 case ir_lod:
2157 assert(!"LOD is not valid for vertex shaders.");
2158 break;
2159 }
2160
2161 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
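/* For txf, the offset is instead folded into the texel coordinate below,
* so it never goes through the message header.
*/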
2162
2163 /* Texel offsets go in the message header; Gen4 also requires headers. */
2164 inst->header_present = use_texture_offset || intel->gen < 5;
2165 inst->base_mrf = 2;
2166 inst->mlen = inst->header_present + 1; /* always at least one */
2167 inst->sampler = sampler;
2168 inst->dst = dst_reg(this, ir->type);
2169 inst->dst.writemask = WRITEMASK_XYZW;
2170 inst->shadow_compare = ir->shadow_comparitor != NULL;
2171
2172 if (use_texture_offset)
2173 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2174
2175 /* MRF for the first parameter */
2176 int param_base = inst->base_mrf + inst->header_present;
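/* param_base is the first MRF after the optional header: the coordinate
* (or the size LOD for txs) goes there, and the shadow comparator, LOD,
* sample index, or gradients follow in the next MRFs as set up below.
*/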
2177
2178 if (ir->op == ir_txs) {
2179 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2180 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2181 } else {
2182 int i, coord_mask = 0, zero_mask = 0;
2183 /* Load the coordinate */
2184 /* FINISHME: gl_clamp_mask and saturate */
2185 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2186 coord_mask |= (1 << i);
2187 for (; i < 4; i++)
2188 zero_mask |= (1 << i);
2189
2190 if (ir->offset && ir->op == ir_txf) {
2191 /* It appears that the ld instruction used for txf does its
2192 * address bounds check before adding in the offset. To work
2193 * around this, just add the integer offset to the integer
2194 * texel coordinate, and don't put the offset in the header.
2195 */
2196 ir_constant *offset = ir->offset->as_constant();
2197 assert(offset);
2198
2199 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2200 src_reg src = coordinate;
2201 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2202 BRW_GET_SWZ(src.swizzle, j),
2203 BRW_GET_SWZ(src.swizzle, j),
2204 BRW_GET_SWZ(src.swizzle, j));
2205 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2206 src, offset->value.i[j]));
2207 }
2208 } else {
2209 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2210 coordinate));
2211 }
2212 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2213 src_reg(0)));
2214 /* Load the shadow comparitor */
2215 if (ir->shadow_comparitor) {
2216 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2217 WRITEMASK_X),
2218 shadow_comparitor));
2219 inst->mlen++;
2220 }
2221
2222 /* Load the LOD info */
2223 if (ir->op == ir_tex || ir->op == ir_txl) {
2224 int mrf, writemask;
2225 if (intel->gen >= 5) {
2226 mrf = param_base + 1;
2227 if (ir->shadow_comparitor) {
2228 writemask = WRITEMASK_Y;
2229 /* mlen already incremented */
2230 } else {
2231 writemask = WRITEMASK_X;
2232 inst->mlen++;
2233 }
2234 } else /* intel->gen == 4 */ {
2235 mrf = param_base;
2236 writemask = WRITEMASK_Z;
2237 }
2238 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2239 } else if (ir->op == ir_txf) {
2240 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2241 } else if (ir->op == ir_txf_ms) {
2242 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2243 sample_index));
2244 inst->mlen++;
2245
2246 /* On Gen7, there is an additional MCS parameter here after SI,
2247 * but we don't bother to emit it since it's always zero. If
2248 * we start supporting texturing from CMS surfaces, this will have
2249 * to change.
2250 */
2251 } else if (ir->op == ir_txd) {
2252 const glsl_type *type = lod_type;
2253
2254 if (intel->gen >= 5) {
2255 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2256 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2257 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2258 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2259 inst->mlen++;
2260
2261 if (ir->type->vector_elements == 3) {
2262 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2263 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2264 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2265 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2266 inst->mlen++;
2267 }
2268 } else /* intel->gen == 4 */ {
2269 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2270 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2271 inst->mlen += 2;
2272 }
2273 }
2274 }
2275
2276 emit(inst);
2277
2278 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2279 * spec requires layers.
2280 */
2281 if (ir->op == ir_txs) {
2282 glsl_type const *type = ir->sampler->type;
2283 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2284 type->sampler_array) {
2285 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2286 with_writemask(inst->dst, WRITEMASK_Z),
2287 src_reg(inst->dst), src_reg(6));
2288 }
2289 }
2290
2291 swizzle_result(ir, src_reg(inst->dst), sampler);
2292 }
2293
2294 void
2295 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2296 {
2297 int s = c->key.tex.swizzles[sampler];
2298
2299 this->result = src_reg(this, ir->type);
2300 dst_reg swizzled_result(this->result);
2301
2302 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2303 || s == SWIZZLE_NOOP) {
2304 emit(MOV(swizzled_result, orig_val));
2305 return;
2306 }
2307
2308 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2309 int swizzle[4];
2310
2311 for (int i = 0; i < 4; i++) {
2312 switch (GET_SWZ(s, i)) {
2313 case SWIZZLE_ZERO:
2314 zero_mask |= (1 << i);
2315 break;
2316 case SWIZZLE_ONE:
2317 one_mask |= (1 << i);
2318 break;
2319 default:
2320 copy_mask |= (1 << i);
2321 swizzle[i] = GET_SWZ(s, i);
2322 break;
2323 }
2324 }
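/* E.g. a key swizzle of (R, G, B, ONE) copies xyz from the texture result
* below and writes 1.0f into w.
*/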
2325
2326 if (copy_mask) {
2327 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2328 swizzled_result.writemask = copy_mask;
2329 emit(MOV(swizzled_result, orig_val));
2330 }
2331
2332 if (zero_mask) {
2333 swizzled_result.writemask = zero_mask;
2334 emit(MOV(swizzled_result, src_reg(0.0f)));
2335 }
2336
2337 if (one_mask) {
2338 swizzled_result.writemask = one_mask;
2339 emit(MOV(swizzled_result, src_reg(1.0f)));
2340 }
2341 }
2342
2343 void
2344 vec4_visitor::visit(ir_return *ir)
2345 {
2346 assert(!"not reached");
2347 }
2348
2349 void
2350 vec4_visitor::visit(ir_discard *ir)
2351 {
2352 assert(!"not reached");
2353 }
2354
2355 void
2356 vec4_visitor::visit(ir_if *ir)
2357 {
2358 /* Don't point the annotation at the if statement, because then it plus
2359 * the then and else blocks get printed.
2360 */
2361 this->base_ir = ir->condition;
2362
2363 if (intel->gen == 6) {
2364 emit_if_gen6(ir);
2365 } else {
2366 uint32_t predicate;
2367 emit_bool_to_cond_code(ir->condition, &predicate);
2368 emit(IF(predicate));
2369 }
2370
2371 visit_instructions(&ir->then_instructions);
2372
2373 if (!ir->else_instructions.is_empty()) {
2374 this->base_ir = ir->condition;
2375 emit(BRW_OPCODE_ELSE);
2376
2377 visit_instructions(&ir->else_instructions);
2378 }
2379
2380 this->base_ir = ir->condition;
2381 emit(BRW_OPCODE_ENDIF);
2382 }
2383
2384 void
2385 vec4_visitor::emit_ndc_computation()
2386 {
2387 /* Get the position */
2388 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2389
2390 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2391 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2392 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2393
2394 current_annotation = "NDC";
2395 dst_reg ndc_w = ndc;
2396 ndc_w.writemask = WRITEMASK_W;
2397 src_reg pos_w = pos;
2398 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2399 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2400
2401 dst_reg ndc_xyz = ndc;
2402 ndc_xyz.writemask = WRITEMASK_XYZ;
2403
2404 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2405 }
2406
2407 void
2408 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2409 {
2410 if (intel->gen < 6 &&
2411 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2412 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2413 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2414 dst_reg header1_w = header1;
2415 header1_w.writemask = WRITEMASK_W;
2416 GLuint i;
2417
2418 emit(MOV(header1, 0u));
2419
2420 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2421 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2422
2423 current_annotation = "Point size";
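/* The 1 << 11 scale and the 0x7ff << 8 mask below place the point width
* into an 11-bit fixed-point field starting at bit 8 of the header dword
* (i.e. a u8.3 value); this is inferred from the constants used here.
*/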
2424 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2425 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2426 }
2427
2428 current_annotation = "Clipping flags";
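/* For each user clip plane, set bit i of the header word when the position
* is on the negative side of plane i: the DP4 sets the flag via
* BRW_CONDITIONAL_L, and the OR of (1 << i) is predicated on it.
*/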
2429 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2430 vec4_instruction *inst;
2431
2432 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VARYING_SLOT_POS]),
2433 src_reg(this->userplane[i])));
2434 inst->conditional_mod = BRW_CONDITIONAL_L;
2435
2436 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2437 inst->predicate = BRW_PREDICATE_NORMAL;
2438 }
2439
2440 /* i965 clipping workaround:
2441 * 1) Test for negative RHW
2442 * 2) If set,
2443 * set ndc = (0,0,0,0)
2444 * set ucp[6] = 1
2445 *
2446 * Later, clipping will detect ucp[6] and ensure the primitive is
2447 * clipped against all fixed planes.
2448 */
2449 if (brw->has_negative_rhw_bug) {
2450 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2451 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2452 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2453 vec4_instruction *inst;
2454 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2455 inst->predicate = BRW_PREDICATE_NORMAL;
2456 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2457 inst->predicate = BRW_PREDICATE_NORMAL;
2458 }
2459
2460 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2461 } else if (intel->gen < 6) {
2462 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2463 } else {
2464 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2465 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2466 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2467 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2468 }
2469 }
2470 }
2471
2472 void
2473 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2474 {
2475 if (intel->gen < 6) {
2476 /* Clip distance slots are set aside in gen5, but they are not used. It
2477 * is not clear whether we actually need to set aside space for them,
2478 * but the performance cost is negligible.
2479 */
2480 return;
2481 }
2482
2483 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2484 *
2485 * "If a linked set of shaders forming the vertex stage contains no
2486 * static write to gl_ClipVertex or gl_ClipDistance, but the
2487 * application has requested clipping against user clip planes through
2488 * the API, then the coordinate written to gl_Position is used for
2489 * comparison against the user clip planes."
2490 *
2491 * This function is only called if the shader didn't write to
2492 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2493 * if the user wrote to it; otherwise we use gl_Position.
2494 */
2495 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2496 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2497 clip_vertex = VARYING_SLOT_POS;
2498 }
2499
2500 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2501 ++i) {
2502 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2503 src_reg(output_reg[clip_vertex]),
2504 src_reg(this->userplane[i + offset])));
2505 }
2506 }
2507
2508 void
2509 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2510 {
2511 assert (varying < VARYING_SLOT_MAX);
2512 reg.type = output_reg[varying].type;
2513 current_annotation = output_reg_annotation[varying];
2514 /* Copy the register, saturating if necessary */
2515 vec4_instruction *inst = emit(MOV(reg,
2516 src_reg(output_reg[varying])));
2517 if ((varying == VARYING_SLOT_COL0 ||
2518 varying == VARYING_SLOT_COL1 ||
2519 varying == VARYING_SLOT_BFC0 ||
2520 varying == VARYING_SLOT_BFC1) &&
2521 c->key.clamp_vertex_color) {
2522 inst->saturate = true;
2523 }
2524 }
2525
2526 void
2527 vec4_visitor::emit_urb_slot(int mrf, int varying)
2528 {
2529 struct brw_reg hw_reg = brw_message_reg(mrf);
2530 dst_reg reg = dst_reg(MRF, mrf);
2531 reg.type = BRW_REGISTER_TYPE_F;
2532
2533 switch (varying) {
2534 case VARYING_SLOT_PSIZ:
2535 /* PSIZ is always in slot 0, and is coupled with other flags. */
2536 current_annotation = "indices, point width, clip flags";
2537 emit_psiz_and_flags(hw_reg);
2538 break;
2539 case BRW_VARYING_SLOT_NDC:
2540 current_annotation = "NDC";
2541 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2542 break;
2543 case BRW_VARYING_SLOT_POS_DUPLICATE:
2544 case VARYING_SLOT_POS:
2545 current_annotation = "gl_Position";
2546 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2547 break;
2548 case VARYING_SLOT_CLIP_DIST0:
2549 case VARYING_SLOT_CLIP_DIST1:
2550 if (this->c->key.uses_clip_distance) {
2551 emit_generic_urb_slot(reg, varying);
2552 } else {
2553 current_annotation = "user clip distances";
2554 emit_clip_distances(hw_reg, (varying - VARYING_SLOT_CLIP_DIST0) * 4);
2555 }
2556 break;
2557 case VARYING_SLOT_EDGE:
2558 /* This is present when doing unfilled polygons. We're supposed to copy
2559 * the edge flag from the user-provided vertex array
2560 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2561 * of that attribute (starts as 1.0f). This is then used in clipping to
2562 * determine which edges should be drawn as wireframe.
2563 */
2564 current_annotation = "edge flag";
2565 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2566 glsl_type::float_type, WRITEMASK_XYZW))));
2567 break;
2568 case BRW_VARYING_SLOT_PAD:
2569 /* No need to write to this slot */
2570 break;
2571 default:
2572 emit_generic_urb_slot(reg, varying);
2573 break;
2574 }
2575 }
2576
2577 static int
2578 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2579 {
2580 struct intel_context *intel = &brw->intel;
2581
2582 if (intel->gen >= 6) {
2583 /* URB data written (does not include the message header reg) must
2584 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2585 * section 5.4.3.2.2: URB_INTERLEAVED.
2586 *
2587 * URB entries are allocated on a multiple of 1024 bits, so an
2588 * extra 128 bits written here to make the end align to 256 is
2589 * no problem.
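*
* Note that mlen here still includes the one header register, so rounding
* mlen up to an odd value makes the data portion (mlen - 1) an even number
* of registers, i.e. a multiple of 256 bits.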
2590 */
2591 if ((mlen % 2) != 1)
2592 mlen++;
2593 }
2594
2595 return mlen;
2596 }
2597
2598 /**
2599 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2600 * complete the VS thread.
2601 *
2602 * The VUE layout is documented in Volume 2a.
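*
* The payload built here is one message-header MRF followed by one MRF per
* VUE slot. Because the writes are interleaved, each MRF covers half of a
* URB row; if the slots don't all fit below max_usable_mrf, a second URB
* write finishes the entry at the matching row offset.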
2603 */
2604 void
2605 vec4_visitor::emit_urb_writes()
2606 {
2607 /* MRF 0 is reserved for the debugger, so start with message header
2608 * in MRF 1.
2609 */
2610 int base_mrf = 1;
2611 int mrf = base_mrf;
2612 /* In the process of generating our URB write message contents, we
2613 * may need to unspill a register or load from an array. Those
2614 * reads would use MRFs 14-15.
2615 */
2616 int max_usable_mrf = 13;
2617
2618 /* The following assertion verifies that max_usable_mrf causes an
2619 * even-numbered amount of URB write data, which will meet gen6's
2620 * requirements for length alignment.
2621 */
2622 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2623
2624 /* First mrf is the g0-based message header containing URB handles and such,
2625 * which is implied in VS_OPCODE_URB_WRITE.
2626 */
2627 mrf++;
2628
2629 if (intel->gen < 6) {
2630 emit_ndc_computation();
2631 }
2632
2633 /* Set up the VUE data for the first URB write */
2634 int slot;
2635 for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
2636 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2637
2638 /* If this was max_usable_mrf, we can't fit anything more into this URB
2639 * WRITE.
2640 */
2641 if (mrf > max_usable_mrf) {
2642 slot++;
2643 break;
2644 }
2645 }
2646
2647 bool eot = slot >= prog_data->vue_map.num_slots;
2648 if (eot) {
2649 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2650 emit_shader_time_end();
2651 }
2652 current_annotation = "URB write";
2653 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2654 inst->base_mrf = base_mrf;
2655 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2656 inst->eot = eot;
2657
2658 /* Optional second URB write */
2659 if (!inst->eot) {
2660 mrf = base_mrf + 1;
2661
2662 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2663 assert(mrf < max_usable_mrf);
2664
2665 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2666 }
2667
2668 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2669 emit_shader_time_end();
2670
2671 current_annotation = "URB write";
2672 inst = emit(VS_OPCODE_URB_WRITE);
2673 inst->base_mrf = base_mrf;
2674 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2675 inst->eot = true;
2676 /* URB destination offset. In the previous write, we got MRFs
2677 * 2-13 minus the one header MRF, so 12 regs. URB offset is in
2678 * URB row increments, and each of our MRFs is half of one of
2679 * those, since we're doing interleaved writes.
2680 */
2681 inst->offset = (max_usable_mrf - base_mrf) / 2;
2682 }
2683 }
2684
2685 src_reg
2686 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2687 src_reg *reladdr, int reg_offset)
2688 {
2689 /* Because we store the values to scratch interleaved like our
2690 * vertex data, we need to scale the vec4 index by 2.
2691 */
2692 int message_header_scale = 2;
2693
2694 /* Pre-gen6, the message header uses byte offsets instead of vec4
2695 * (16-byte) offset units.
2696 */
2697 if (intel->gen < 6)
2698 message_header_scale *= 16;
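/* E.g. with no reladdr, reg_offset 3 yields src_reg(6) (vec4 units) on
* gen6+ or src_reg(96) (bytes) on pre-gen6; with a reladdr, the same scale
* is applied after the reladdr + reg_offset add below.
*/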
2699
2700 if (reladdr) {
2701 src_reg index = src_reg(this, glsl_type::int_type);
2702
2703 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2704 emit_before(inst, MUL(dst_reg(index),
2705 index, src_reg(message_header_scale)));
2706
2707 return index;
2708 } else {
2709 return src_reg(reg_offset * message_header_scale);
2710 }
2711 }
2712
2713 src_reg
2714 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2715 src_reg *reladdr, int reg_offset)
2716 {
2717 if (reladdr) {
2718 src_reg index = src_reg(this, glsl_type::int_type);
2719
2720 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2721
2722 /* Pre-gen6, the message header uses byte offsets instead of vec4
2723 * (16-byte) offset units.
2724 */
2725 if (intel->gen < 6) {
2726 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2727 }
2728
2729 return index;
2730 } else {
2731 int message_header_scale = intel->gen < 6 ? 16 : 1;
2732 return src_reg(reg_offset * message_header_scale);
2733 }
2734 }
2735
2736 /**
2737 * Emits an instruction before @inst to load the value named by @orig_src
2738 * from scratch space at @base_offset to @temp.
2739 *
2740 * @base_offset is measured in 32-byte units (the size of a register).
2741 */
2742 void
2743 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2744 dst_reg temp, src_reg orig_src,
2745 int base_offset)
2746 {
2747 int reg_offset = base_offset + orig_src.reg_offset;
2748 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2749
2750 emit_before(inst, SCRATCH_READ(temp, index));
2751 }
2752
2753 /**
2754 * Emits an instruction after @inst to store the value to be written
2755 * to @orig_dst to scratch space at @base_offset, from @temp.
2756 *
2757 * @base_offset is measured in 32-byte units (the size of a register).
2758 */
2759 void
2760 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2761 {
2762 int reg_offset = base_offset + inst->dst.reg_offset;
2763 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2764
2765 /* Create a temporary register to store *inst's result in.
2766 *
2767 * We have to be careful in MOVing from our temporary result register in
2768 * the scratch write. If we swizzle from channels of the temporary that
2769 * weren't initialized, it will confuse live interval analysis, which will
2770 * make spilling fail to make progress.
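*
* For example, a .yw destination writemask gives temp a .yyyw read swizzle
* below, so the scratch write only reads channels inst actually wrote.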
2771 */
2772 src_reg temp = src_reg(this, glsl_type::vec4_type);
2773 temp.type = inst->dst.type;
2774 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2775 int swizzles[4];
2776 for (int i = 0; i < 4; i++)
2777 if (inst->dst.writemask & (1 << i))
2778 swizzles[i] = i;
2779 else
2780 swizzles[i] = first_writemask_chan;
2781 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2782 swizzles[2], swizzles[3]);
2783
2784 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2785 inst->dst.writemask));
2786 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2787 write->predicate = inst->predicate;
2788 write->ir = inst->ir;
2789 write->annotation = inst->annotation;
2790 inst->insert_after(write);
2791
2792 inst->dst.file = temp.file;
2793 inst->dst.reg = temp.reg;
2794 inst->dst.reg_offset = temp.reg_offset;
2795 inst->dst.reladdr = NULL;
2796 }
2797
2798 /**
2799 * We can't generally support array access in GRF space, because a
2800 * single instruction's destination can only span 2 contiguous
2801 * registers. So, we send all GRF arrays that get variable index
2802 * access to scratch space.
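*
* For example, a "vec4 arr[8]" temporary indexed by a non-constant value:
* each arr[i] read becomes a SCRATCH_READ into a fresh temporary, and each
* arr[i] write gets a SCRATCH_WRITE appended after it.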
2803 */
2804 void
2805 vec4_visitor::move_grf_array_access_to_scratch()
2806 {
2807 int scratch_loc[this->virtual_grf_count];
2808
2809 for (int i = 0; i < this->virtual_grf_count; i++) {
2810 scratch_loc[i] = -1;
2811 }
2812
2813 /* First, calculate the set of virtual GRFs that need to be punted
2814 * to scratch due to having any array access on them, and where in
2815 * scratch.
2816 */
2817 foreach_list(node, &this->instructions) {
2818 vec4_instruction *inst = (vec4_instruction *)node;
2819
2820 if (inst->dst.file == GRF && inst->dst.reladdr &&
2821 scratch_loc[inst->dst.reg] == -1) {
2822 scratch_loc[inst->dst.reg] = c->base.last_scratch;
2823 c->base.last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2824 }
2825
2826 for (int i = 0 ; i < 3; i++) {
2827 src_reg *src = &inst->src[i];
2828
2829 if (src->file == GRF && src->reladdr &&
2830 scratch_loc[src->reg] == -1) {
2831 scratch_loc[src->reg] = c->base.last_scratch;
2832 c->base.last_scratch += this->virtual_grf_sizes[src->reg];
2833 }
2834 }
2835 }
2836
2837 /* Now, for anything that will be accessed through scratch, rewrite
2838 * it to load/store. Note that this is a _safe list walk, because
2839 * we may generate a new scratch_write instruction after the one
2840 * we're processing.
2841 */
2842 foreach_list_safe(node, &this->instructions) {
2843 vec4_instruction *inst = (vec4_instruction *)node;
2844
2845 /* Set up the annotation tracking for new generated instructions. */
2846 base_ir = inst->ir;
2847 current_annotation = inst->annotation;
2848
2849 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2850 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2851 }
2852
2853 for (int i = 0 ; i < 3; i++) {
2854 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2855 continue;
2856
2857 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2858
2859 emit_scratch_read(inst, temp, inst->src[i],
2860 scratch_loc[inst->src[i].reg]);
2861
2862 inst->src[i].file = temp.file;
2863 inst->src[i].reg = temp.reg;
2864 inst->src[i].reg_offset = temp.reg_offset;
2865 inst->src[i].reladdr = NULL;
2866 }
2867 }
2868 }
2869
2870 /**
2871 * Emits an instruction before @inst to load the value named by @orig_src
2872 * from the pull constant buffer (surface) at @base_offset to @temp.
2873 */
2874 void
2875 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2876 dst_reg temp, src_reg orig_src,
2877 int base_offset)
2878 {
2879 int reg_offset = base_offset + orig_src.reg_offset;
2880 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2881 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2882 vec4_instruction *load;
2883
2884 if (intel->gen >= 7) {
2885 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
2886 grf_offset.type = offset.type;
2887 emit_before(inst, MOV(grf_offset, offset));
2888
2889 load = new(mem_ctx) vec4_instruction(this,
2890 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
2891 temp, index, src_reg(grf_offset));
2892 } else {
2893 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2894 temp, index, offset);
2895 load->base_mrf = 14;
2896 load->mlen = 1;
2897 }
2898 emit_before(inst, load);
2899 }
2900
2901 /**
2902 * Implements array access of uniforms by inserting a
2903 * PULL_CONSTANT_LOAD instruction.
2904 *
2905 * Unlike temporary GRF array access (where we don't support it due to
2906 * the difficulty of doing relative addressing on instruction
2907 * destinations), we could potentially do array access of uniforms
2908 * that were loaded in GRF space as push constants. In real-world
2909 * usage we've seen, though, the arrays being used are always larger
2910 * than we could load as push constants, so just always move all
2911 * uniform array access out to a pull constant buffer.
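*
* For example, "uniform vec4 colors[16]" indexed by a non-constant value:
* the whole array is copied into pull_param, and each such access becomes
* a pull constant load from the constant buffer surface.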
2912 */
2913 void
2914 vec4_visitor::move_uniform_array_access_to_pull_constants()
2915 {
2916 int pull_constant_loc[this->uniforms];
2917
2918 for (int i = 0; i < this->uniforms; i++) {
2919 pull_constant_loc[i] = -1;
2920 }
2921
2922 /* Walk through and find array access of uniforms. Put a copy of that
2923 * uniform in the pull constant buffer.
2924 *
2925 * Note that we don't move constant-indexed accesses to arrays. No
2926 * testing has been done of the performance impact of this choice.
2927 */
2928 foreach_list_safe(node, &this->instructions) {
2929 vec4_instruction *inst = (vec4_instruction *)node;
2930
2931 for (int i = 0 ; i < 3; i++) {
2932 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2933 continue;
2934
2935 int uniform = inst->src[i].reg;
2936
2937 /* If this array isn't already present in the pull constant buffer,
2938 * add it.
2939 */
2940 if (pull_constant_loc[uniform] == -1) {
2941 const float **values = &prog_data->param[uniform * 4];
2942
2943 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2944
2945 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2946 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2947 }
2948 }
2949
2950 /* Set up the annotation tracking for new generated instructions. */
2951 base_ir = inst->ir;
2952 current_annotation = inst->annotation;
2953
2954 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2955
2956 emit_pull_constant_load(inst, temp, inst->src[i],
2957 pull_constant_loc[uniform]);
2958
2959 inst->src[i].file = temp.file;
2960 inst->src[i].reg = temp.reg;
2961 inst->src[i].reg_offset = temp.reg_offset;
2962 inst->src[i].reladdr = NULL;
2963 }
2964 }
2965
2966 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2967 * no need to track them as larger-than-vec4 objects. This will be
2968 * relied on in cutting out unused uniform vectors from push
2969 * constants.
2970 */
2971 split_uniform_registers();
2972 }
2973
2974 void
2975 vec4_visitor::resolve_ud_negate(src_reg *reg)
2976 {
2977 if (reg->type != BRW_REGISTER_TYPE_UD ||
2978 !reg->negate)
2979 return;
2980
2981 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2982 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2983 *reg = temp;
2984 }
2985
2986 vec4_visitor::vec4_visitor(struct brw_context *brw,
2987 struct brw_vs_compile *c,
2988 struct brw_vs_prog_data *prog_data,
2989 struct gl_shader_program *shader_prog,
2990 struct brw_shader *shader,
2991 void *mem_ctx)
2992 {
2993 this->c = c;
2994 this->brw = brw;
2995 this->intel = &brw->intel;
2996 this->ctx = &intel->ctx;
2997 this->shader_prog = shader_prog;
2998 this->shader = shader;
2999
3000 this->mem_ctx = mem_ctx;
3001 this->failed = false;
3002
3003 this->base_ir = NULL;
3004 this->current_annotation = NULL;
3005 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3006
3008 this->prog = &c->vp->program.Base;
3009 this->prog_data = prog_data;
3010
3011 this->variable_ht = hash_table_ctor(0,
3012 hash_table_pointer_hash,
3013 hash_table_pointer_compare);
3014
3015 this->virtual_grf_def = NULL;
3016 this->virtual_grf_use = NULL;
3017 this->virtual_grf_sizes = NULL;
3018 this->virtual_grf_count = 0;
3019 this->virtual_grf_reg_map = NULL;
3020 this->virtual_grf_reg_count = 0;
3021 this->virtual_grf_array_size = 0;
3022 this->live_intervals_valid = false;
3023
3024 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3025
3026 this->uniforms = 0;
3027 }
3028
3029 vec4_visitor::~vec4_visitor()
3030 {
3031 hash_table_dtor(this->variable_ht);
3032 }
3033
3034
3035 void
3036 vec4_visitor::fail(const char *format, ...)
3037 {
3038 va_list va;
3039 char *msg;
3040
3041 if (failed)
3042 return;
3043
3044 failed = true;
3045
3046 va_start(va, format);
3047 msg = ralloc_vasprintf(mem_ctx, format, va);
3048 va_end(va);
3049 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3050
3051 this->fail_msg = msg;
3052
3053 if (INTEL_DEBUG & DEBUG_VS) {
3054 fprintf(stderr, "%s", msg);
3055 }
3056 }
3057
3058 } /* namespace brw */