/* i965/vs: Add support for if(any_nequal()) and if(all_equal()) on gen6.
 * [mesa.git] src/mesa/drivers/dri/i965/brw_vec4_emit.cpp
 */
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "../glsl/ir_print_visitor.h"
26
27 extern "C" {
28 #include "brw_eu.h"
29 };
30
31 using namespace brw;
32
33 namespace brw {
34
35 int
36 vec4_visitor::setup_attributes(int payload_reg)
37 {
38 int nr_attributes;
39 int attribute_map[VERT_ATTRIB_MAX];
40
41 nr_attributes = 0;
42 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
43 if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
44 attribute_map[i] = payload_reg + nr_attributes;
45 nr_attributes++;
46 }
47 }
48
49 foreach_iter(exec_list_iterator, iter, this->instructions) {
50 vec4_instruction *inst = (vec4_instruction *)iter.get();
51
52 for (int i = 0; i < 3; i++) {
53 if (inst->src[i].file != ATTR)
54 continue;
55
56 inst->src[i].file = HW_REG;
57 inst->src[i].fixed_hw_reg = brw_vec8_grf(attribute_map[inst->src[i].reg], 0);
58 inst->src[i].fixed_hw_reg.dw1.bits.swizzle = inst->src[i].swizzle;
59 }
60 }
61
62 /* The BSpec says we always have to read at least one thing from
63 * the VF, and it appears that the hardware wedges otherwise.
64 */
65 if (nr_attributes == 0)
66 nr_attributes = 1;
67
68 prog_data->urb_read_length = (nr_attributes + 1) / 2;
69
70 return payload_reg + nr_attributes;
71 }
72
/* Lay out the push-constant (CURBE) registers: user clip planes first,
 * then the shader uniforms.  Returns the first register number past the
 * uniform section.
 */
int
vec4_visitor::setup_uniforms(int reg)
{
   /* User clip planes from curbe:
    */
   if (c->key.nr_userclip) {
      if (intel->gen >= 6) {
	 /* Two vec4 planes per register, each a 4-float scalar-strided
	  * region at dword offset 0 or 4.
	  */
	 for (int i = 0; i < c->key.nr_userclip; i++) {
	    c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
						  (i % 2) * 4), 0, 4, 1);
	 }
	 reg += ALIGN(c->key.nr_userclip, 2) / 2;
      } else {
	 /* Pre-gen6 layout starts the planes 3 registers (6 vec4 slots)
	  * into the section — presumably reserved CURBE space on those
	  * parts; TODO confirm against the pre-gen6 CURBE layout.
	  */
	 for (int i = 0; i < c->key.nr_userclip; i++) {
	    c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
						  (i % 2) * 4), 0, 4, 1);
	 }
	 reg += (ALIGN(6 + c->key.nr_userclip, 4) / 4) * 2;
      }
   }

   /* The pre-gen6 VS requires that some push constants get loaded no
    * matter what, or the GPU would hang.
    */
   if (intel->gen < 6 && this->uniforms == 0) {
      /* Invent one dummy vec4 uniform whose four components are forced
       * to zero at upload time.
       */
      this->uniform_size[this->uniforms] = 1;

      for (unsigned int i = 0; i < 4; i++) {
	 unsigned int slot = this->uniforms * 4 + i;

	 c->prog_data.param[slot] = NULL;
	 c->prog_data.param_convert[slot] = PARAM_CONVERT_ZERO;
      }

      this->uniforms++;
      reg++;
   } else {
      /* Two vec4 uniforms fit per register. */
      reg += ALIGN(uniforms, 2) / 2;
   }

   /* for now, we are not doing any elimination of unused slots, nor
    * are we packing our uniforms.
    */
   c->prog_data.nr_params = this->uniforms * 4;

   /* Registers consumed after g0 (which carries the URB handles). */
   c->prog_data.curb_read_length = reg - 1;
   c->prog_data.uses_new_param_layout = true;

   return reg;
}
123
124 void
125 vec4_visitor::setup_payload(void)
126 {
127 int reg = 0;
128
129 /* The payload always contains important data in g0, which contains
130 * the URB handles that are passed on to the URB write at the end
131 * of the thread. So, we always start push constants at g1.
132 */
133 reg++;
134
135 reg = setup_uniforms(reg);
136
137 reg = setup_attributes(reg);
138
139 this->first_non_payload_grf = reg;
140 }
141
142 struct brw_reg
143 vec4_instruction::get_dst(void)
144 {
145 struct brw_reg brw_reg;
146
147 switch (dst.file) {
148 case GRF:
149 brw_reg = brw_vec8_grf(dst.reg + dst.reg_offset, 0);
150 brw_reg = retype(brw_reg, dst.type);
151 brw_reg.dw1.bits.writemask = dst.writemask;
152 break;
153
154 case HW_REG:
155 brw_reg = dst.fixed_hw_reg;
156 break;
157
158 case BAD_FILE:
159 brw_reg = brw_null_reg();
160 break;
161
162 default:
163 assert(!"not reached");
164 brw_reg = brw_null_reg();
165 break;
166 }
167 return brw_reg;
168 }
169
170 struct brw_reg
171 vec4_instruction::get_src(int i)
172 {
173 struct brw_reg brw_reg;
174
175 switch (src[i].file) {
176 case GRF:
177 brw_reg = brw_vec8_grf(src[i].reg + src[i].reg_offset, 0);
178 brw_reg = retype(brw_reg, src[i].type);
179 brw_reg.dw1.bits.swizzle = src[i].swizzle;
180 if (src[i].abs)
181 brw_reg = brw_abs(brw_reg);
182 if (src[i].negate)
183 brw_reg = negate(brw_reg);
184 break;
185
186 case IMM:
187 switch (src[i].type) {
188 case BRW_REGISTER_TYPE_F:
189 brw_reg = brw_imm_f(src[i].imm.f);
190 break;
191 case BRW_REGISTER_TYPE_D:
192 brw_reg = brw_imm_d(src[i].imm.i);
193 break;
194 case BRW_REGISTER_TYPE_UD:
195 brw_reg = brw_imm_ud(src[i].imm.u);
196 break;
197 default:
198 assert(!"not reached");
199 brw_reg = brw_null_reg();
200 break;
201 }
202 break;
203
204 case UNIFORM:
205 brw_reg = stride(brw_vec4_grf(1 + (src[i].reg + src[i].reg_offset) / 2,
206 ((src[i].reg + src[i].reg_offset) % 2) * 4),
207 0, 4, 1);
208 brw_reg = retype(brw_reg, src[i].type);
209 brw_reg.dw1.bits.swizzle = src[i].swizzle;
210 if (src[i].abs)
211 brw_reg = brw_abs(brw_reg);
212 if (src[i].negate)
213 brw_reg = negate(brw_reg);
214 break;
215
216 case HW_REG:
217 brw_reg = src[i].fixed_hw_reg;
218 break;
219
220 case BAD_FILE:
221 /* Probably unused. */
222 brw_reg = brw_null_reg();
223 break;
224 case ATTR:
225 default:
226 assert(!"not reached");
227 brw_reg = brw_null_reg();
228 break;
229 }
230
231 return brw_reg;
232 }
233
/* Emit a pre-gen6 single-source math instruction; the operand has been
 * staged at inst->base_mrf by the visitor.
 */
void
vec4_visitor::generate_math1_gen4(vec4_instruction *inst,
				  struct brw_reg dst,
				  struct brw_reg src)
{
   brw_math(p,
	    dst,
	    brw_math_function(inst->opcode),
	    BRW_MATH_SATURATE_NONE,
	    inst->base_mrf,
	    src,
	    BRW_MATH_DATA_SCALAR,
	    BRW_MATH_PRECISION_FULL);
}
248
/* Emit a gen6+ single-source math instruction.
 *
 * NOTE(review): this body is currently identical to generate_math1_gen4,
 * including passing base_mrf — presumably gen6-specific operand handling
 * (math as a regular EU instruction, no MRF) is still to come; confirm.
 */
void
vec4_visitor::generate_math1_gen6(vec4_instruction *inst,
				  struct brw_reg dst,
				  struct brw_reg src)
{
   brw_math(p,
	    dst,
	    brw_math_function(inst->opcode),
	    BRW_MATH_SATURATE_NONE,
	    inst->base_mrf,
	    src,
	    BRW_MATH_DATA_SCALAR,
	    BRW_MATH_PRECISION_FULL);
}
263
/* Emit the URB write that delivers this thread's vertex results.  The
 * payload was staged in MRFs starting at inst->base_mrf; inst->eot marks
 * the final write that terminates the thread.
 */
void
vec4_visitor::generate_urb_write(vec4_instruction *inst)
{
   brw_urb_WRITE(p,
		 brw_null_reg(), /* dest */
		 inst->base_mrf, /* starting mrf reg nr */
		 brw_vec8_grf(0, 0), /* src */
		 false,		/* allocate */
		 true,		/* used */
		 inst->mlen,
		 0,		/* response len */
		 inst->eot,	/* eot */
		 inst->eot,	/* writes complete */
		 inst->offset,	/* urb destination offset */
		 BRW_URB_SWIZZLE_INTERLEAVE);
}
280
281 void
282 vec4_visitor::generate_vs_instruction(vec4_instruction *instruction,
283 struct brw_reg dst,
284 struct brw_reg *src)
285 {
286 vec4_instruction *inst = (vec4_instruction *)instruction;
287
288 switch (inst->opcode) {
289 case SHADER_OPCODE_RCP:
290 case SHADER_OPCODE_RSQ:
291 case SHADER_OPCODE_SQRT:
292 case SHADER_OPCODE_EXP2:
293 case SHADER_OPCODE_LOG2:
294 case SHADER_OPCODE_SIN:
295 case SHADER_OPCODE_COS:
296 if (intel->gen >= 6) {
297 generate_math1_gen6(inst, dst, src[0]);
298 } else {
299 generate_math1_gen4(inst, dst, src[0]);
300 }
301 break;
302
303 case SHADER_OPCODE_POW:
304 assert(!"finishme");
305 break;
306
307 case VS_OPCODE_URB_WRITE:
308 generate_urb_write(inst);
309 break;
310
311 default:
312 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
313 fail("unsupported opcode in `%s' in VS\n",
314 brw_opcodes[inst->opcode].name);
315 } else {
316 fail("Unsupported opcode %d in VS", inst->opcode);
317 }
318 }
319 }
320
321 bool
322 vec4_visitor::run()
323 {
324 /* Generate FS IR for main(). (the visitor only descends into
325 * functions called "main").
326 */
327 foreach_iter(exec_list_iterator, iter, *shader->ir) {
328 ir_instruction *ir = (ir_instruction *)iter.get();
329 base_ir = ir;
330 ir->accept(this);
331 }
332
333 emit_urb_writes();
334
335 if (failed)
336 return false;
337
338 setup_payload();
339 reg_allocate();
340
341 brw_set_access_mode(p, BRW_ALIGN_16);
342
343 generate_code();
344
345 return !failed;
346 }
347
/* Walk the vec4 IR instruction list and emit native EU code through the
 * brw_eu emitters, maintaining the flow-control bookkeeping (loop stack,
 * per-loop IF depth) needed for pre-gen6 jump patching.
 */
void
vec4_visitor::generate_code()
{
   int last_native_inst = p->nr_insn;
   const char *last_annotation_string = NULL;
   ir_instruction *last_annotation_ir = NULL;

   /* Stack of open DO instructions, grown on demand.  if_depth_in_loop
    * tracks how many IFs are open inside each loop level, which pre-gen6
    * BREAK/CONT need to pop the right number of nesting levels.
    * rzalloc zero-fills, so depth 0 starts at an IF count of 0.
    */
   int loop_stack_array_size = 16;
   int loop_stack_depth = 0;
   brw_instruction **loop_stack =
      rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
   int *if_depth_in_loop =
      rzalloc_array(this->mem_ctx, int, loop_stack_array_size);


   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      printf("Native code for vertex shader %d:\n", prog->Name);
   }

   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;
      struct brw_reg src[3], dst;

      /* Print each IR node / annotation string once per run of
       * instructions that share it, grouping the disassembly by source.
       */
      if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
	 if (last_annotation_ir != inst->ir) {
	    last_annotation_ir = inst->ir;
	    if (last_annotation_ir) {
	       printf(" ");
	       last_annotation_ir->print();
	       printf("\n");
	    }
	 }
	 if (last_annotation_string != inst->annotation) {
	    last_annotation_string = inst->annotation;
	    if (last_annotation_string)
	       printf(" %s\n", last_annotation_string);
	 }
      }

      for (unsigned int i = 0; i < 3; i++) {
	 src[i] = inst->get_src(i);
      }
      dst = inst->get_dst();

      /* Per-instruction state that the brw_* emitters pick up. */
      brw_set_conditionalmod(p, inst->conditional_mod);
      brw_set_predicate_control(p, inst->predicate);
      brw_set_predicate_inverse(p, inst->predicate_inverse);
      brw_set_saturate(p, inst->saturate);

      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
	 brw_MOV(p, dst, src[0]);
	 break;
      case BRW_OPCODE_ADD:
	 brw_ADD(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_MUL:
	 brw_MUL(p, dst, src[0], src[1]);
	 break;

      case BRW_OPCODE_FRC:
	 brw_FRC(p, dst, src[0]);
	 break;
      case BRW_OPCODE_RNDD:
	 brw_RNDD(p, dst, src[0]);
	 break;
      case BRW_OPCODE_RNDE:
	 brw_RNDE(p, dst, src[0]);
	 break;
      case BRW_OPCODE_RNDZ:
	 brw_RNDZ(p, dst, src[0]);
	 break;

      case BRW_OPCODE_AND:
	 brw_AND(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_OR:
	 brw_OR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_XOR:
	 brw_XOR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_NOT:
	 brw_NOT(p, dst, src[0]);
	 break;
      case BRW_OPCODE_ASR:
	 brw_ASR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_SHR:
	 brw_SHR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_SHL:
	 brw_SHL(p, dst, src[0], src[1]);
	 break;

      case BRW_OPCODE_CMP:
	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
	 break;
      case BRW_OPCODE_SEL:
	 brw_SEL(p, dst, src[0], src[1]);
	 break;

      case BRW_OPCODE_DP4:
	 brw_DP4(p, dst, src[0], src[1]);
	 break;

      case BRW_OPCODE_DP3:
	 brw_DP3(p, dst, src[0], src[1]);
	 break;

      case BRW_OPCODE_DP2:
	 brw_DP2(p, dst, src[0], src[1]);
	 break;

      case BRW_OPCODE_IF:
	 if (inst->src[0].file != BAD_FILE) {
	    /* The instruction has an embedded compare (only allowed on gen6) */
	    assert(intel->gen == 6);
	    gen6_IF(p, inst->conditional_mod, src[0], src[1]);
	 } else {
	    /* Plain predicated IF; the predicate was set up above. */
	    struct brw_instruction *brw_inst = brw_IF(p, BRW_EXECUTE_8);
	    brw_inst->header.predicate_control = inst->predicate;
	 }
	 if_depth_in_loop[loop_stack_depth]++;
	 break;

      case BRW_OPCODE_ELSE:
	 brw_ELSE(p);
	 break;
      case BRW_OPCODE_ENDIF:
	 brw_ENDIF(p);
	 if_depth_in_loop[loop_stack_depth]--;
	 break;

      case BRW_OPCODE_DO:
	 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
	 /* Grow both parallel arrays together when the stack fills up. */
	 if (loop_stack_array_size <= loop_stack_depth) {
	    loop_stack_array_size *= 2;
	    loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *,
				  loop_stack_array_size);
	    if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
					loop_stack_array_size);
	 }
	 if_depth_in_loop[loop_stack_depth] = 0;
	 break;

      case BRW_OPCODE_BREAK:
	 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;
      case BRW_OPCODE_CONTINUE:
	 /* FINISHME: We need to write the loop instruction support still. */
	 if (intel->gen >= 6)
	    gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
	 else
	    brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;

      case BRW_OPCODE_WHILE: {
	 struct brw_instruction *inst0, *inst1;
	 /* Jump-count scale factor; presumably 2 because gen5+ counts
	  * jumps in 64-bit units — TODO confirm against the PRM.
	  */
	 GLuint br = 1;

	 if (intel->gen >= 5)
	    br = 2;

	 assert(loop_stack_depth > 0);
	 loop_stack_depth--;
	 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
	 if (intel->gen < 6) {
	    /* patch all the BREAK/CONT instructions from last BGNLOOP */
	    while (inst0 > loop_stack[loop_stack_depth]) {
	       inst0--;
	       if (inst0->header.opcode == BRW_OPCODE_BREAK &&
		   inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
	       }
	       else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
			inst0->bits3.if_else.jump_count == 0) {
		  inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
	       }
	    }
	 }
      }
	 break;

      default:
	 /* Everything else is a VS-specific opcode. */
	 generate_vs_instruction(inst, dst, src);
	 break;
      }

      /* Disassemble whatever native instructions this IR instruction
       * produced (flip the if (0) to also dump the raw dwords).
       */
      if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
	    if (0) {
	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
		      ((uint32_t *)&p->store[i])[3],
		      ((uint32_t *)&p->store[i])[2],
		      ((uint32_t *)&p->store[i])[1],
		      ((uint32_t *)&p->store[i])[0]);
	    }
	    brw_disasm(stdout, &p->store[i], intel->gen);
	 }
      }

      last_native_inst = p->nr_insn;
   }

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      printf("\n");
   }

   ralloc_free(loop_stack);
   ralloc_free(if_depth_in_loop);

   /* Resolve gen6+ flow-control jump targets now that all code is emitted. */
   brw_set_uip_jip(p);

   /* OK, while the INTEL_DEBUG=vs above is very nice for debugging VS
    * emit issues, it doesn't get the jump distances into the output,
    * which is often something we want to debug. So this is here in
    * case you're doing that.
    */
   if (0) {
      if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
	 for (unsigned int i = 0; i < p->nr_insn; i++) {
	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
		   ((uint32_t *)&p->store[i])[3],
		   ((uint32_t *)&p->store[i])[2],
		   ((uint32_t *)&p->store[i])[1],
		   ((uint32_t *)&p->store[i])[0]);
	    brw_disasm(stdout, &p->store[i], intel->gen);
	 }
      }
   }
}
582
583 extern "C" {
584
585 bool
586 brw_vs_emit(struct brw_vs_compile *c)
587 {
588 struct brw_compile *p = &c->func;
589 struct brw_context *brw = p->brw;
590 struct intel_context *intel = &brw->intel;
591 struct gl_context *ctx = &intel->ctx;
592 struct gl_shader_program *prog = ctx->Shader.CurrentVertexProgram;
593
594 if (!prog)
595 return false;
596
597 struct brw_shader *shader =
598 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];
599 if (!shader)
600 return false;
601
602 if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
603 printf("GLSL IR for native vertex shader %d:\n", prog->Name);
604 _mesa_print_ir(shader->ir, NULL);
605 printf("\n\n");
606 }
607
608 vec4_visitor v(c, prog, shader);
609 if (!v.run()) {
610 /* FINISHME: Cleanly fail, test at link time, etc. */
611 assert(!"not reached");
612 return false;
613 }
614
615 return true;
616 }
617
618 } /* extern "C" */
619
620 } /* namespace brw */