i965/fs: Add support for compute-to-mrf in 16-wide mode.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Eric Anholt <eric@anholt.net>
25 *
26 */
27
28 extern "C" {
29
30 #include <sys/types.h>
31
32 #include "main/macros.h"
33 #include "main/shaderobj.h"
34 #include "main/uniforms.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "program/prog_optimize.h"
38 #include "program/register_allocate.h"
39 #include "program/sampler.h"
40 #include "program/hash_table.h"
41 #include "brw_context.h"
42 #include "brw_eu.h"
43 #include "brw_wm.h"
44 }
45 #include "brw_fs.h"
46 #include "../glsl/glsl_types.h"
47 #include "../glsl/ir_optimization.h"
48 #include "../glsl/ir_print_visitor.h"
49
50 #define MAX_INSTRUCTION (1 << 30)
51 static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);
52
53 struct gl_shader *
54 brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
55 {
56 struct brw_shader *shader;
57
58 shader = rzalloc(NULL, struct brw_shader);
59 if (shader) {
60 shader->base.Type = type;
61 shader->base.Name = name;
62 _mesa_init_shader(ctx, &shader->base);
63 }
64
65 return &shader->base;
66 }
67
68 struct gl_shader_program *
69 brw_new_shader_program(struct gl_context *ctx, GLuint name)
70 {
71 struct brw_shader_program *prog;
72 prog = rzalloc(NULL, struct brw_shader_program);
73 if (prog) {
74 prog->base.Name = name;
75 _mesa_init_shader_program(ctx, &prog->base);
76 }
77 return &prog->base;
78 }
79
80 GLboolean
81 brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
82 {
83 struct brw_context *brw = brw_context(ctx);
84 struct intel_context *intel = &brw->intel;
85
86 struct brw_shader *shader =
87 (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
88 if (shader != NULL) {
89 void *mem_ctx = ralloc_context(NULL);
90 bool progress;
91
92 if (shader->ir)
93 ralloc_free(shader->ir);
94 shader->ir = new(shader) exec_list;
95 clone_ir_list(mem_ctx, shader->ir, shader->base.ir);
96
97 do_mat_op_to_vec(shader->ir);
98 lower_instructions(shader->ir,
99 MOD_TO_FRACT |
100 DIV_TO_MUL_RCP |
101 SUB_TO_ADD_NEG |
102 EXP_TO_EXP2 |
103 LOG_TO_LOG2);
104
105 /* Pre-gen6 HW can only nest if-statements 16 deep. Beyond this,
106 * if-statements need to be flattened.
107 */
108 if (intel->gen < 6)
109 lower_if_to_cond_assign(shader->ir, 16);
110
111 do_lower_texture_projection(shader->ir);
112 do_vec_index_to_cond_assign(shader->ir);
113 brw_do_cubemap_normalize(shader->ir);
114 lower_noise(shader->ir);
115 lower_quadop_vector(shader->ir, false);
116 lower_variable_index_to_cond_assign(shader->ir,
117 GL_TRUE, /* input */
118 GL_TRUE, /* output */
119 GL_TRUE, /* temp */
120 GL_TRUE /* uniform */
121 );
122
123 do {
124 progress = false;
125
126 brw_do_channel_expressions(shader->ir);
127 brw_do_vector_splitting(shader->ir);
128
129 progress = do_lower_jumps(shader->ir, true, true,
130 true, /* main return */
131 false, /* continue */
132 false /* loops */
133 ) || progress;
134
135 progress = do_common_optimization(shader->ir, true, 32) || progress;
136 } while (progress);
137
138 validate_ir_tree(shader->ir);
139
140 reparent_ir(shader->ir, shader->ir);
141 ralloc_free(mem_ctx);
142 }
143
144 if (!_mesa_ir_link_shader(ctx, prog))
145 return GL_FALSE;
146
147 return GL_TRUE;
148 }
149
150 static int
151 type_size(const struct glsl_type *type)
152 {
153 unsigned int size, i;
154
155 switch (type->base_type) {
156 case GLSL_TYPE_UINT:
157 case GLSL_TYPE_INT:
158 case GLSL_TYPE_FLOAT:
159 case GLSL_TYPE_BOOL:
160 return type->components();
161 case GLSL_TYPE_ARRAY:
162 return type_size(type->fields.array) * type->length;
163 case GLSL_TYPE_STRUCT:
164 size = 0;
165 for (i = 0; i < type->length; i++) {
166 size += type_size(type->fields.structure[i].type);
167 }
168 return size;
169 case GLSL_TYPE_SAMPLER:
170 /* Samplers take up no register space, since they're baked in at
171 * link time.
172 */
173 return 0;
174 default:
175 assert(!"not reached");
176 return 0;
177 }
178 }
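/* A few worked cases of the sizing above (a sketch, derived from the
 * switch): a float counts 1 and a vec4 counts 4 via components(), a
 * mat3 counts 9 since components() multiplies vector_elements by
 * matrix_columns, a vec2[4] array counts 8, and any sampler counts 0.
 */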
179
180 void
181 fs_visitor::fail(const char *format, ...)
182 {
183 if (!failed) {
184 failed = true;
185
186 if (INTEL_DEBUG & DEBUG_WM) {
187 fprintf(stderr, "FS compile failed: ");
188
189 va_list va;
190 va_start(va, format);
191 vfprintf(stderr, format, va);
192 va_end(va);
193 }
194 }
195 }
196
197 void
198 fs_visitor::push_force_uncompressed()
199 {
200 force_uncompressed_stack++;
201 }
202
203 void
204 fs_visitor::pop_force_uncompressed()
205 {
206 force_uncompressed_stack--;
207 assert(force_uncompressed_stack >= 0);
208 }
209
210 void
211 fs_visitor::push_force_sechalf()
212 {
213 force_sechalf_stack++;
214 }
215
216 void
217 fs_visitor::pop_force_sechalf()
218 {
219 force_sechalf_stack--;
220 assert(force_sechalf_stack >= 0);
221 }
222
223 /**
224 * Returns how many MRFs an FS opcode will write over.
225 *
226 * Note that this is not the 0 or 1 implied MRF writes of an actual gen
227 * instruction -- the FS opcodes often generate additional MOVs.
228 */
229 int
230 fs_visitor::implied_mrf_writes(fs_inst *inst)
231 {
232 if (inst->mlen == 0)
233 return 0;
234
235 switch (inst->opcode) {
236 case FS_OPCODE_RCP:
237 case FS_OPCODE_RSQ:
238 case FS_OPCODE_SQRT:
239 case FS_OPCODE_EXP2:
240 case FS_OPCODE_LOG2:
241 case FS_OPCODE_SIN:
242 case FS_OPCODE_COS:
243 return 1 * c->dispatch_width / 8;
244 case FS_OPCODE_POW:
245 return 2 * c->dispatch_width / 8;
246 case FS_OPCODE_TEX:
247 case FS_OPCODE_TXB:
248 case FS_OPCODE_TXD:
249 case FS_OPCODE_TXL:
250 return 1;
251 case FS_OPCODE_FB_WRITE:
252 return 2;
253 case FS_OPCODE_PULL_CONSTANT_LOAD:
254 case FS_OPCODE_UNSPILL:
255 return 1;
256 case FS_OPCODE_SPILL:
257 return 2;
258 default:
259 assert(!"not reached");
260 return inst->mlen;
261 }
262 }
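/* Sketch of the arithmetic above: in 16-wide dispatch, FS_OPCODE_POW
 * reports 2 * 16 / 8 = 4 MRFs written, while the SPILL/UNSPILL counts
 * stay fixed regardless of dispatch width.
 */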
263
264 int
265 fs_visitor::virtual_grf_alloc(int size)
266 {
267 if (virtual_grf_array_size <= virtual_grf_next) {
268 if (virtual_grf_array_size == 0)
269 virtual_grf_array_size = 16;
270 else
271 virtual_grf_array_size *= 2;
272 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
273 virtual_grf_array_size);
274
275 /* This slot is always unused. */
276 virtual_grf_sizes[0] = 0;
277 }
278 virtual_grf_sizes[virtual_grf_next] = size;
279 return virtual_grf_next++;
280 }
281
282 /** Fixed HW reg constructor. */
283 fs_reg::fs_reg(enum register_file file, int hw_reg)
284 {
285 init();
286 this->file = file;
287 this->hw_reg = hw_reg;
288 this->type = BRW_REGISTER_TYPE_F;
289 }
290
291 /** Fixed HW reg constructor. */
292 fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
293 {
294 init();
295 this->file = file;
296 this->hw_reg = hw_reg;
297 this->type = type;
298 }
299
300 int
301 brw_type_for_base_type(const struct glsl_type *type)
302 {
303 switch (type->base_type) {
304 case GLSL_TYPE_FLOAT:
305 return BRW_REGISTER_TYPE_F;
306 case GLSL_TYPE_INT:
307 case GLSL_TYPE_BOOL:
308 return BRW_REGISTER_TYPE_D;
309 case GLSL_TYPE_UINT:
310 return BRW_REGISTER_TYPE_UD;
311 case GLSL_TYPE_ARRAY:
312 case GLSL_TYPE_STRUCT:
313 case GLSL_TYPE_SAMPLER:
314 /* These should be overridden with the type of the member when
315 * dereferenced into. BRW_REGISTER_TYPE_UD seems like a likely
316 * way to trip up if we don't.
317 */
318 return BRW_REGISTER_TYPE_UD;
319 default:
320 assert(!"not reached");
321 return BRW_REGISTER_TYPE_F;
322 }
323 }
324
325 /** Automatic reg constructor. */
326 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
327 {
328 init();
329
330 this->file = GRF;
331 this->reg = v->virtual_grf_alloc(type_size(type));
332 this->reg_offset = 0;
333 this->type = brw_type_for_base_type(type);
334 }
335
336 fs_reg *
337 fs_visitor::variable_storage(ir_variable *var)
338 {
339 return (fs_reg *)hash_table_find(this->variable_ht, var);
340 }
341
342 void
343 import_uniforms_callback(const void *key,
344 void *data,
345 void *closure)
346 {
347 struct hash_table *dst_ht = (struct hash_table *)closure;
348 const fs_reg *reg = (const fs_reg *)data;
349
350 if (reg->file != UNIFORM)
351 return;
352
353 hash_table_insert(dst_ht, data, key);
354 }
355
356 /* For 16-wide, we need to follow the uniform setup of the 8-wide
357 * dispatch.  This brings in those uniform definitions.
358 */
359 void
360 fs_visitor::import_uniforms(struct hash_table *src_variable_ht)
361 {
362 hash_table_call_foreach(src_variable_ht,
363 import_uniforms_callback,
364 variable_ht);
365 }
366
367 /* Our support for uniforms is piggy-backed on the struct
368 * gl_fragment_program, because that's where the values actually
369 * get stored, rather than in some global gl_shader_program uniform
370 * store.
371 */
372 int
373 fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
374 {
375 unsigned int offset = 0;
376
377 if (type->is_matrix()) {
378 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
379 type->vector_elements,
380 1);
381
382 for (unsigned int i = 0; i < type->matrix_columns; i++) {
383 offset += setup_uniform_values(loc + offset, column);
384 }
385
386 return offset;
387 }
388
389 switch (type->base_type) {
390 case GLSL_TYPE_FLOAT:
391 case GLSL_TYPE_UINT:
392 case GLSL_TYPE_INT:
393 case GLSL_TYPE_BOOL:
394 for (unsigned int i = 0; i < type->vector_elements; i++) {
395 unsigned int param = c->prog_data.nr_params++;
396
397 assert(param < ARRAY_SIZE(c->prog_data.param));
398
399 switch (type->base_type) {
400 case GLSL_TYPE_FLOAT:
401 c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
402 break;
403 case GLSL_TYPE_UINT:
404 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
405 break;
406 case GLSL_TYPE_INT:
407 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
408 break;
409 case GLSL_TYPE_BOOL:
410 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
411 break;
412 default:
413 assert(!"not reached");
414 c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
415 break;
416 }
417 this->param_index[param] = loc;
418 this->param_offset[param] = i;
419 }
420 return 1;
421
422 case GLSL_TYPE_STRUCT:
423 for (unsigned int i = 0; i < type->length; i++) {
424 offset += setup_uniform_values(loc + offset,
425 type->fields.structure[i].type);
426 }
427 return offset;
428
429 case GLSL_TYPE_ARRAY:
430 for (unsigned int i = 0; i < type->length; i++) {
431 offset += setup_uniform_values(loc + offset, type->fields.array);
432 }
433 return offset;
434
435 case GLSL_TYPE_SAMPLER:
436 /* The sampler takes up a slot, but we don't use any values from it. */
437 return 1;
438
439 default:
440 assert(!"not reached");
441 return 0;
442 }
443 }
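/* A sketch of how the recursion above plays out: a mat2 uniform splits
 * into two vec2 columns; each column consumes one location and two
 * param slots, so the call returns 2 and advances
 * c->prog_data.nr_params by 4.
 */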
444
445
446 /* Our support for builtin uniforms is even scarier than non-builtin.
447 * It sits on top of the PROG_STATE_VAR parameters that are
448 * automatically updated from GL context state.
449 */
450 void
451 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
452 {
453 const ir_state_slot *const slots = ir->state_slots;
454 assert(ir->state_slots != NULL);
455
456 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
457 /* This state reference has already been set up by ir_to_mesa, but we'll
458 * get the same index back here.
459 */
460 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
461 (gl_state_index *)slots[i].tokens);
462
463 /* Add each of the unique swizzles of the element as a parameter.
464 * This'll end up matching the expected layout of the
465 * array/matrix/structure we're trying to fill in.
466 */
467 int last_swiz = -1;
468 for (unsigned int j = 0; j < 4; j++) {
469 int swiz = GET_SWZ(slots[i].swizzle, j);
470 if (swiz == last_swiz)
471 break;
472 last_swiz = swiz;
473
474 c->prog_data.param_convert[c->prog_data.nr_params] =
475 PARAM_NO_CONVERT;
476 this->param_index[c->prog_data.nr_params] = index;
477 this->param_offset[c->prog_data.nr_params] = swiz;
478 c->prog_data.nr_params++;
479 }
480 }
481 }
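/* For example, a state slot swizzled .xxxx contributes a single
 * parameter (the loop above breaks on the repeated swizzle), while
 * .xyzw contributes four -- matching the layout ir_to_mesa produced.
 */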
482
483 fs_reg *
484 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
485 {
486 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
487 fs_reg wpos = *reg;
488 fs_reg neg_y = this->pixel_y;
489 neg_y.negate = true;
490 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
491
492 /* gl_FragCoord.x */
493 if (ir->pixel_center_integer) {
494 emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
495 } else {
496 emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
497 }
498 wpos.reg_offset++;
499
500 /* gl_FragCoord.y */
501 if (!flip && ir->pixel_center_integer) {
502 emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
503 } else {
504 fs_reg pixel_y = this->pixel_y;
505 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
506
507 if (flip) {
508 pixel_y.negate = true;
509 offset += c->key.drawable_height - 1.0;
510 }
511
512 emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
513 }
514 wpos.reg_offset++;
515
516 /* gl_FragCoord.z */
517 if (intel->gen >= 6) {
518 emit(BRW_OPCODE_MOV, wpos,
519 fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
520 } else {
521 emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
522 interp_reg(FRAG_ATTRIB_WPOS, 2));
523 }
524 wpos.reg_offset++;
525
526 /* gl_FragCoord.w: Already set up in emit_interpolation_setup_*() */
527 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
528
529 return reg;
530 }
531
532 fs_reg *
533 fs_visitor::emit_general_interpolation(ir_variable *ir)
534 {
535 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
536 /* Interpolation is always in floating point regs. */
537 reg->type = BRW_REGISTER_TYPE_F;
538 fs_reg attr = *reg;
539
540 unsigned int array_elements;
541 const glsl_type *type;
542
543 if (ir->type->is_array()) {
544 array_elements = ir->type->length;
545 if (array_elements == 0) {
546 fail("dereferenced array '%s' has length 0\n", ir->name);
547 }
548 type = ir->type->fields.array;
549 } else {
550 array_elements = 1;
551 type = ir->type;
552 }
553
554 int location = ir->location;
555 for (unsigned int i = 0; i < array_elements; i++) {
556 for (unsigned int j = 0; j < type->matrix_columns; j++) {
557 if (urb_setup[location] == -1) {
558 /* If there's no incoming setup data for this slot, don't
559 * emit interpolation for it.
560 */
561 attr.reg_offset += type->vector_elements;
562 location++;
563 continue;
564 }
565
566 bool is_gl_Color =
567 location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1;
568
569 if (c->key.flat_shade && is_gl_Color) {
570 /* Constant interpolation (flat shading) case. The SF has
571 * handed us defined values in only the constant offset
572 * field of the setup reg.
573 */
574 for (unsigned int k = 0; k < type->vector_elements; k++) {
575 struct brw_reg interp = interp_reg(location, k);
576 interp = suboffset(interp, 3);
577 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
578 attr.reg_offset++;
579 }
580 } else {
581 /* Perspective interpolation case. */
582 for (unsigned int k = 0; k < type->vector_elements; k++) {
583 struct brw_reg interp = interp_reg(location, k);
584 emit(FS_OPCODE_LINTERP, attr,
585 this->delta_x, this->delta_y, fs_reg(interp));
586 attr.reg_offset++;
587 }
588
589 if (intel->gen < 6 && !(is_gl_Color && c->key.linear_color)) {
590 attr.reg_offset -= type->vector_elements;
591 for (unsigned int k = 0; k < type->vector_elements; k++) {
592 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
593 attr.reg_offset++;
594 }
595 }
596 }
597 location++;
598 }
599 }
600
601 return reg;
602 }
603
604 fs_reg *
605 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
606 {
607 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
608
609 /* The frontfacing comes in as a bit in the thread payload. */
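/* On gen6 the sequence below computes (~(g0.0:D >> 15)) & 1: the
 * arithmetic shift brings payload bit 15 (presumably the back-facing
 * flag, mirroring bit 31 of r1.6 used on older gens below) down to
 * bit 0, and the NOT/AND pair inverts and masks it into a 0/1 value.
 */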
610 if (intel->gen >= 6) {
611 emit(BRW_OPCODE_ASR, *reg,
612 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
613 fs_reg(15));
614 emit(BRW_OPCODE_NOT, *reg, *reg);
615 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
616 } else {
617 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
618 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
619 * us front face
620 */
621 fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
622 fs_reg(r1_6ud),
623 fs_reg(1u << 31));
624 inst->conditional_mod = BRW_CONDITIONAL_L;
625 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
626 }
627
628 return reg;
629 }
630
631 fs_inst *
632 fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
633 {
634 switch (opcode) {
635 case FS_OPCODE_RCP:
636 case FS_OPCODE_RSQ:
637 case FS_OPCODE_SQRT:
638 case FS_OPCODE_EXP2:
639 case FS_OPCODE_LOG2:
640 case FS_OPCODE_SIN:
641 case FS_OPCODE_COS:
642 break;
643 default:
644 assert(!"not reached: bad math opcode");
645 return NULL;
646 }
647
648 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
649 * might be able to do better by doing execsize = 1 math and then
650 * expanding that result out, but we would need to be careful with
651 * masking.
652 *
653 * The hardware ignores source modifiers (negate and abs) on math
654 * instructions, so we also move to a temp to set those up.
655 */
656 if (intel->gen >= 6 && (src.file == UNIFORM ||
657 src.abs ||
658 src.negate)) {
659 fs_reg expanded = fs_reg(this, glsl_type::float_type);
660 emit(BRW_OPCODE_MOV, expanded, src);
661 src = expanded;
662 }
663
664 fs_inst *inst = emit(opcode, dst, src);
665
666 if (intel->gen < 6) {
667 inst->base_mrf = 2;
668 inst->mlen = c->dispatch_width / 8;
669 }
670
671 return inst;
672 }
673
674 fs_inst *
675 fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
676 {
677 int base_mrf = 2;
678 fs_inst *inst;
679
680 assert(opcode == FS_OPCODE_POW);
681
682 if (intel->gen >= 6) {
683 /* Can't do hstride == 0 args to gen6 math, so expand it out.
684 *
685 * The hardware ignores source modifiers (negate and abs) on math
686 * instructions, so we also move to a temp to set those up.
687 */
688 if (src0.file == UNIFORM || src0.abs || src0.negate) {
689 fs_reg expanded = fs_reg(this, glsl_type::float_type);
690 emit(BRW_OPCODE_MOV, expanded, src0);
691 src0 = expanded;
692 }
693
694 if (src1.file == UNIFORM || src1.abs || src1.negate) {
695 fs_reg expanded = fs_reg(this, glsl_type::float_type);
696 emit(BRW_OPCODE_MOV, expanded, src1);
697 src1 = expanded;
698 }
699
700 inst = emit(opcode, dst, src0, src1);
701 } else {
702 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1);
703 inst = emit(opcode, dst, src0, reg_null_f);
704
705 inst->base_mrf = base_mrf;
706 inst->mlen = 2 * c->dispatch_width / 8;
707 }
708 return inst;
709 }
710
711 void
712 fs_visitor::visit(ir_variable *ir)
713 {
714 fs_reg *reg = NULL;
715
716 if (variable_storage(ir))
717 return;
718
719 if (strcmp(ir->name, "gl_FragColor") == 0) {
720 this->frag_color = ir;
721 } else if (strcmp(ir->name, "gl_FragData") == 0) {
722 this->frag_data = ir;
723 } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
724 this->frag_depth = ir;
725 }
726
727 if (ir->mode == ir_var_in) {
728 if (!strcmp(ir->name, "gl_FragCoord")) {
729 reg = emit_fragcoord_interpolation(ir);
730 } else if (!strcmp(ir->name, "gl_FrontFacing")) {
731 reg = emit_frontfacing_interpolation(ir);
732 } else {
733 reg = emit_general_interpolation(ir);
734 }
735 assert(reg);
736 hash_table_insert(this->variable_ht, reg, ir);
737 return;
738 }
739
740 if (ir->mode == ir_var_uniform) {
741 int param_index = c->prog_data.nr_params;
742
743 if (c->dispatch_width == 16) {
744 if (!variable_storage(ir)) {
745 fail("Failed to find uniform '%s' in 16-wide\n", ir->name);
746 }
747 return;
748 }
749
750 if (!strncmp(ir->name, "gl_", 3)) {
751 setup_builtin_uniform_values(ir);
752 } else {
753 setup_uniform_values(ir->location, ir->type);
754 }
755
756 reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
757 reg->type = brw_type_for_base_type(ir->type);
758 }
759
760 if (!reg)
761 reg = new(this->mem_ctx) fs_reg(this, ir->type);
762
763 hash_table_insert(this->variable_ht, reg, ir);
764 }
765
766 void
767 fs_visitor::visit(ir_dereference_variable *ir)
768 {
769 fs_reg *reg = variable_storage(ir->var);
770 this->result = *reg;
771 }
772
773 void
774 fs_visitor::visit(ir_dereference_record *ir)
775 {
776 const glsl_type *struct_type = ir->record->type;
777
778 ir->record->accept(this);
779
780 unsigned int offset = 0;
781 for (unsigned int i = 0; i < struct_type->length; i++) {
782 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
783 break;
784 offset += type_size(struct_type->fields.structure[i].type);
785 }
786 this->result.reg_offset += offset;
787 this->result.type = brw_type_for_base_type(ir->type);
788 }
789
790 void
791 fs_visitor::visit(ir_dereference_array *ir)
792 {
793 ir_constant *index;
794 int element_size;
795
796 ir->array->accept(this);
797 index = ir->array_index->as_constant();
798
799 element_size = type_size(ir->type);
800 this->result.type = brw_type_for_base_type(ir->type);
801
802 if (index) {
803 assert(this->result.file == UNIFORM ||
804 (this->result.file == GRF &&
805 this->result.reg != 0));
806 this->result.reg_offset += index->value.i[0] * element_size;
807 } else {
808 assert(!"FINISHME: non-constant array element");
809 }
810 }
811
812 /* Instruction selection: Produce a MOV.sat instead of
813 * MIN(MAX(val, 0), 1) when possible.
814 */
815 bool
816 fs_visitor::try_emit_saturate(ir_expression *ir)
817 {
818 ir_rvalue *sat_val = ir->as_rvalue_to_saturate();
819
820 if (!sat_val)
821 return false;
822
823 sat_val->accept(this);
824 fs_reg src = this->result;
825
826 this->result = fs_reg(this, ir->type);
827 fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
828 inst->saturate = true;
829
830 return true;
831 }
832
833 static uint32_t
834 brw_conditional_for_comparison(unsigned int op)
835 {
836 switch (op) {
837 case ir_binop_less:
838 return BRW_CONDITIONAL_L;
839 case ir_binop_greater:
840 return BRW_CONDITIONAL_G;
841 case ir_binop_lequal:
842 return BRW_CONDITIONAL_LE;
843 case ir_binop_gequal:
844 return BRW_CONDITIONAL_GE;
845 case ir_binop_equal:
846 case ir_binop_all_equal: /* same as equal for scalars */
847 return BRW_CONDITIONAL_Z;
848 case ir_binop_nequal:
849 case ir_binop_any_nequal: /* same as nequal for scalars */
850 return BRW_CONDITIONAL_NZ;
851 default:
852 assert(!"not reached: bad operation for comparison");
853 return BRW_CONDITIONAL_NZ;
854 }
855 }
856
857 void
858 fs_visitor::visit(ir_expression *ir)
859 {
860 unsigned int operand;
861 fs_reg op[2], temp;
862 fs_inst *inst;
863
864 assert(ir->get_num_operands() <= 2);
865
866 if (try_emit_saturate(ir))
867 return;
868
869 for (operand = 0; operand < ir->get_num_operands(); operand++) {
870 ir->operands[operand]->accept(this);
871 if (this->result.file == BAD_FILE) {
872 ir_print_visitor v;
873 fail("Failed to get tree for expression operand:\n");
874 ir->operands[operand]->accept(&v);
875 }
876 op[operand] = this->result;
877
878 /* Matrix expression operands should have been broken down to vector
879 * operations already.
880 */
881 assert(!ir->operands[operand]->type->is_matrix());
882 /* And then those vector operands should have been broken down to scalar.
883 */
884 assert(!ir->operands[operand]->type->is_vector());
885 }
886
887 /* Storage for our result. If our result goes into an assignment, it will
888 * just get copy-propagated out, so no worries.
889 */
890 this->result = fs_reg(this, ir->type);
891
892 switch (ir->operation) {
893 case ir_unop_logic_not:
894 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
895 * the one's complement of the whole register, not just bit 0.
896 */
897 emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
898 break;
899 case ir_unop_neg:
900 op[0].negate = !op[0].negate;
901 this->result = op[0];
902 break;
903 case ir_unop_abs:
904 op[0].abs = true;
905 op[0].negate = false;
906 this->result = op[0];
907 break;
908 case ir_unop_sign:
909 temp = fs_reg(this, ir->type);
910
911 emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));
912
913 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
914 inst->conditional_mod = BRW_CONDITIONAL_G;
915 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
916 inst->predicated = true;
917
918 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
919 inst->conditional_mod = BRW_CONDITIONAL_L;
920 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
921 inst->predicated = true;
922
923 break;
924 case ir_unop_rcp:
925 emit_math(FS_OPCODE_RCP, this->result, op[0]);
926 break;
927
928 case ir_unop_exp2:
929 emit_math(FS_OPCODE_EXP2, this->result, op[0]);
930 break;
931 case ir_unop_log2:
932 emit_math(FS_OPCODE_LOG2, this->result, op[0]);
933 break;
934 case ir_unop_exp:
935 case ir_unop_log:
936 assert(!"not reached: should be handled by ir_explog_to_explog2");
937 break;
938 case ir_unop_sin:
939 case ir_unop_sin_reduced:
940 emit_math(FS_OPCODE_SIN, this->result, op[0]);
941 break;
942 case ir_unop_cos:
943 case ir_unop_cos_reduced:
944 emit_math(FS_OPCODE_COS, this->result, op[0]);
945 break;
946
947 case ir_unop_dFdx:
948 emit(FS_OPCODE_DDX, this->result, op[0]);
949 break;
950 case ir_unop_dFdy:
951 emit(FS_OPCODE_DDY, this->result, op[0]);
952 break;
953
954 case ir_binop_add:
955 emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
956 break;
957 case ir_binop_sub:
958 assert(!"not reached: should be handled by ir_sub_to_add_neg");
959 break;
960
961 case ir_binop_mul:
962 emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
963 break;
964 case ir_binop_div:
965 assert(!"not reached: should be handled by ir_div_to_mul_rcp");
966 break;
967 case ir_binop_mod:
968 assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
969 break;
970
971 case ir_binop_less:
972 case ir_binop_greater:
973 case ir_binop_lequal:
974 case ir_binop_gequal:
975 case ir_binop_equal:
976 case ir_binop_all_equal:
977 case ir_binop_nequal:
978 case ir_binop_any_nequal:
979 temp = this->result;
980 /* original gen4 does implicit conversion before comparison. */
981 if (intel->gen < 5)
982 temp.type = op[0].type;
983
984 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
985 inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
986 emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1));
987 break;
988
989 case ir_binop_logic_xor:
990 emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
991 break;
992
993 case ir_binop_logic_or:
994 emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
995 break;
996
997 case ir_binop_logic_and:
998 emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
999 break;
1000
1001 case ir_binop_dot:
1002 case ir_unop_any:
1003 assert(!"not reached: should be handled by brw_fs_channel_expressions");
1004 break;
1005
1006 case ir_unop_noise:
1007 assert(!"not reached: should be handled by lower_noise");
1008 break;
1009
1010 case ir_quadop_vector:
1011 assert(!"not reached: should be handled by lower_quadop_vector");
1012 break;
1013
1014 case ir_unop_sqrt:
1015 emit_math(FS_OPCODE_SQRT, this->result, op[0]);
1016 break;
1017
1018 case ir_unop_rsq:
1019 emit_math(FS_OPCODE_RSQ, this->result, op[0]);
1020 break;
1021
1022 case ir_unop_i2f:
1023 case ir_unop_b2f:
1024 case ir_unop_b2i:
1025 case ir_unop_f2i:
1026 emit(BRW_OPCODE_MOV, this->result, op[0]);
1027 break;
1028 case ir_unop_f2b:
1029 case ir_unop_i2b:
1030 temp = this->result;
1031 /* original gen4 does implicit conversion before comparison. */
1032 if (intel->gen < 5)
1033 temp.type = op[0].type;
1034
1035 inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f));
1036 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1037 inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
1038 break;
1039
1040 case ir_unop_trunc:
1041 emit(BRW_OPCODE_RNDZ, this->result, op[0]);
1042 break;
1043 case ir_unop_ceil:
1044 op[0].negate = !op[0].negate;
1045 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
1046 this->result.negate = true;
1047 break;
1048 case ir_unop_floor:
1049 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
1050 break;
1051 case ir_unop_fract:
1052 inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
1053 break;
1054 case ir_unop_round_even:
1055 emit(BRW_OPCODE_RNDE, this->result, op[0]);
1056 break;
1057
1058 case ir_binop_min:
1059 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
1060 inst->conditional_mod = BRW_CONDITIONAL_L;
1061
1062 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
1063 inst->predicated = true;
1064 break;
1065 case ir_binop_max:
1066 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
1067 inst->conditional_mod = BRW_CONDITIONAL_G;
1068
1069 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
1070 inst->predicated = true;
1071 break;
1072
1073 case ir_binop_pow:
1074 emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
1075 break;
1076
1077 case ir_unop_bit_not:
1078 inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
1079 break;
1080 case ir_binop_bit_and:
1081 inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
1082 break;
1083 case ir_binop_bit_xor:
1084 inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
1085 break;
1086 case ir_binop_bit_or:
1087 inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
1088 break;
1089
1090 case ir_unop_u2f:
1091 case ir_binop_lshift:
1092 case ir_binop_rshift:
1093 assert(!"GLSL 1.30 features unsupported");
1094 break;
1095 }
1096 }
1097
1098 void
1099 fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
1100 const glsl_type *type, bool predicated)
1101 {
1102 switch (type->base_type) {
1103 case GLSL_TYPE_FLOAT:
1104 case GLSL_TYPE_UINT:
1105 case GLSL_TYPE_INT:
1106 case GLSL_TYPE_BOOL:
1107 for (unsigned int i = 0; i < type->components(); i++) {
1108 l.type = brw_type_for_base_type(type);
1109 r.type = brw_type_for_base_type(type);
1110
1111 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
1112 inst->predicated = predicated;
1113
1114 l.reg_offset++;
1115 r.reg_offset++;
1116 }
1117 break;
1118 case GLSL_TYPE_ARRAY:
1119 for (unsigned int i = 0; i < type->length; i++) {
1120 emit_assignment_writes(l, r, type->fields.array, predicated);
1121 }
1122 break;
1123
1124 case GLSL_TYPE_STRUCT:
1125 for (unsigned int i = 0; i < type->length; i++) {
1126 emit_assignment_writes(l, r, type->fields.structure[i].type,
1127 predicated);
1128 }
1129 break;
1130
1131 case GLSL_TYPE_SAMPLER:
1132 break;
1133
1134 default:
1135 assert(!"not reached");
1136 break;
1137 }
1138 }
1139
1140 void
1141 fs_visitor::visit(ir_assignment *ir)
1142 {
1143 struct fs_reg l, r;
1144 fs_inst *inst;
1145
1146 /* FINISHME: arrays on the lhs */
1147 ir->lhs->accept(this);
1148 l = this->result;
1149
1150 ir->rhs->accept(this);
1151 r = this->result;
1152
1153 assert(l.file != BAD_FILE);
1154 assert(r.file != BAD_FILE);
1155
1156 if (ir->condition) {
1157 emit_bool_to_cond_code(ir->condition);
1158 }
1159
1160 if (ir->lhs->type->is_scalar() ||
1161 ir->lhs->type->is_vector()) {
1162 for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
1163 if (ir->write_mask & (1 << i)) {
1164 inst = emit(BRW_OPCODE_MOV, l, r);
1165 if (ir->condition)
1166 inst->predicated = true;
1167 r.reg_offset++;
1168 }
1169 l.reg_offset++;
1170 }
1171 } else {
1172 emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
1173 }
1174 }
1175
1176 fs_inst *
1177 fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
1178 {
1179 int mlen;
1180 int base_mrf = 1;
1181 bool simd16 = false;
1182 fs_reg orig_dst;
1183
1184 /* g0 header. */
1185 mlen = 1;
1186
1187 if (ir->shadow_comparitor) {
1188 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1189 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
1190 coordinate.reg_offset++;
1191 }
1192 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
1193 mlen += 3;
1194
1195 if (ir->op == ir_tex) {
1196 /* There's no plain shadow compare message, so we use shadow
1197 * compare with a bias of 0.0.
1198 */
1199 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
1200 mlen++;
1201 } else if (ir->op == ir_txb) {
1202 ir->lod_info.bias->accept(this);
1203 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1204 mlen++;
1205 } else {
1206 assert(ir->op == ir_txl);
1207 ir->lod_info.lod->accept(this);
1208 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1209 mlen++;
1210 }
1211
1212 ir->shadow_comparitor->accept(this);
1213 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1214 mlen++;
1215 } else if (ir->op == ir_tex) {
1216 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1217 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
1218 coordinate.reg_offset++;
1219 }
1220 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
1221 mlen += 3;
1222 } else if (ir->op == ir_txd) {
1223 assert(!"TXD isn't supported on gen4 yet.");
1224 } else {
1225 /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod
1226 * instructions. We'll need to do SIMD16 here.
1227 */
1228 assert(ir->op == ir_txb || ir->op == ir_txl);
1229
1230 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1231 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), coordinate);
1232 coordinate.reg_offset++;
1233 }
1234
1235 /* lod/bias appears after u/v/r. */
1236 mlen += 6;
1237
1238 if (ir->op == ir_txb) {
1239 ir->lod_info.bias->accept(this);
1240 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1241 mlen++;
1242 } else {
1243 ir->lod_info.lod->accept(this);
1244 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1245 mlen++;
1246 }
1247
1248 /* The unused upper half of the lod/bias register pair. */
1249 mlen++;
1250
1251 /* Now, since we're doing simd16, the return is 2 interleaved
1252 * vec4s where the odd-indexed ones are junk. We'll need to move
1253 * this weirdness around to the expected layout.
1254 */
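/* Concretely, the fixup loop near the end of this function copies
 * vec4 slots 0, 2, 4 and 6 of that interleaved return and drops the
 * junk-filled odd slots.
 */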
1255 simd16 = true;
1256 orig_dst = dst;
1257 dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
1258 2));
1259 dst.type = BRW_REGISTER_TYPE_F;
1260 }
1261
1262 fs_inst *inst = NULL;
1263 switch (ir->op) {
1264 case ir_tex:
1265 inst = emit(FS_OPCODE_TEX, dst);
1266 break;
1267 case ir_txb:
1268 inst = emit(FS_OPCODE_TXB, dst);
1269 break;
1270 case ir_txl:
1271 inst = emit(FS_OPCODE_TXL, dst);
1272 break;
1273 case ir_txd:
1274 inst = emit(FS_OPCODE_TXD, dst);
1275 break;
1276 case ir_txf:
1277 assert(!"GLSL 1.30 features unsupported");
1278 break;
1279 }
1280 inst->base_mrf = base_mrf;
1281 inst->mlen = mlen;
1282
1283 if (simd16) {
1284 for (int i = 0; i < 4; i++) {
1285 emit(BRW_OPCODE_MOV, orig_dst, dst);
1286 orig_dst.reg_offset++;
1287 dst.reg_offset += 2;
1288 }
1289 }
1290
1291 return inst;
1292 }
1293
1294 /* gen5's sampler has slots for u, v, r, array index, then optional
1295 * parameters like shadow comparitor or LOD bias. If optional
1296 * parameters aren't present, those base slots are optional and don't
1297 * need to be included in the message.
1298 *
1299 * We don't fill in the unnecessary slots regardless, which may look
1300 * surprising in the disassembly.
1301 */
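/* A worked SIMD8 case (hypothetical vec2 shadow lookup): the header
 * goes in m1 and u/v in m2-m3; the r/array-index slots m4-m5 are
 * counted by the MAX2 below but left unwritten; the comparitor then
 * lands in m6, for a final mlen of 6.
 */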
1302 fs_inst *
1303 fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
1304 {
1305 int mlen = 1; /* g0 header always present. */
1306 int base_mrf = 1;
1307 int reg_width = c->dispatch_width / 8;
1308
1309 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1310 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * reg_width),
1311 coordinate);
1312 coordinate.reg_offset++;
1313 }
1314 mlen += ir->coordinate->type->vector_elements * reg_width;
1315
1316 if (ir->shadow_comparitor) {
1317 mlen = MAX2(mlen, 1 + 4 * reg_width);
1318
1319 ir->shadow_comparitor->accept(this);
1320 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1321 mlen += reg_width;
1322 }
1323
1324 fs_inst *inst = NULL;
1325 switch (ir->op) {
1326 case ir_tex:
1327 inst = emit(FS_OPCODE_TEX, dst);
1328 break;
1329 case ir_txb:
1330 ir->lod_info.bias->accept(this);
1331 mlen = MAX2(mlen, 1 + 4 * reg_width);
1332 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1333 mlen += reg_width;
1334
1335 inst = emit(FS_OPCODE_TXB, dst);
1336
1337 break;
1338 case ir_txl:
1339 ir->lod_info.lod->accept(this);
1340 mlen = MAX2(mlen, 1 + 4 * reg_width);
1341 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1342 mlen += reg_width;
1343
1344 inst = emit(FS_OPCODE_TXL, dst);
1345 break;
1346 case ir_txd:
1347 case ir_txf:
1348 assert(!"GLSL 1.30 features unsupported");
1349 break;
1350 }
1351 inst->base_mrf = base_mrf;
1352 inst->mlen = mlen;
1353
1354 if (mlen > 11) {
1355 fail("Message length >11 disallowed by hardware\n");
1356 }
1357
1358 return inst;
1359 }
1360
1361 void
1362 fs_visitor::visit(ir_texture *ir)
1363 {
1364 int sampler;
1365 fs_inst *inst = NULL;
1366
1367 ir->coordinate->accept(this);
1368 fs_reg coordinate = this->result;
1369
1370 if (ir->offset != NULL) {
1371 ir_constant *offset = ir->offset->as_constant();
1372 assert(offset != NULL);
1373
1374 signed char offsets[3];
1375 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++)
1376 offsets[i] = (signed char) offset->value.i[i];
1377
1378 /* Combine all three offsets into a single unsigned dword:
1379 *
1380 * bits 11:8 - U Offset (X component)
1381 * bits 7:4 - V Offset (Y component)
1382 * bits 3:0 - R Offset (Z component)
1383 */
1384 unsigned offset_bits = 0;
1385 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) {
1386 const unsigned shift = 4 * (2 - i);
1387 offset_bits |= (offsets[i] << shift) & (0xF << shift);
1388 }
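/* A worked example with hypothetical offsets (1, -1, 0): U packs as
 * 1 << 8 = 0x100, V as (-1 << 4) & 0xF0 = 0xF0, and R as 0, giving
 * offset_bits == 0x1F0 -- each component is a 4-bit two's complement
 * field.
 */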
1389
1390 /* Explicitly set up the message header by copying g0 to msg reg m1. */
1391 emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD),
1392 fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD));
1393
1394 /* Then set the offset bits in DWord 2 of the message header. */
1395 emit(BRW_OPCODE_MOV,
1396 fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2),
1397 BRW_REGISTER_TYPE_UD)),
1398 fs_reg(brw_imm_uw(offset_bits)));
1399 }
1400
1401 /* Should be lowered by do_lower_texture_projection */
1402 assert(!ir->projector);
1403
1404 sampler = _mesa_get_sampler_uniform_value(ir->sampler,
1405 ctx->Shader.CurrentFragmentProgram,
1406 &brw->fragment_program->Base);
1407 sampler = c->fp->program.Base.SamplerUnits[sampler];
1408
1409 /* The 965 requires the EU to do the normalization of GL rectangle
1410 * texture coordinates. We use the program parameter state
1411 * tracking to get the scaling factor.
1412 */
1413 if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
1414 struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
1415 int tokens[STATE_LENGTH] = {
1416 STATE_INTERNAL,
1417 STATE_TEXRECT_SCALE,
1418 sampler,
1419 0,
1420 0
1421 };
1422
1423 if (c->dispatch_width == 16) {
1424 fail("rectangle scale uniform setup not supported on 16-wide\n");
1425 this->result = fs_reg(this, ir->type);
1426 return;
1427 }
1428
1429 c->prog_data.param_convert[c->prog_data.nr_params] =
1430 PARAM_NO_CONVERT;
1431 c->prog_data.param_convert[c->prog_data.nr_params + 1] =
1432 PARAM_NO_CONVERT;
1433
1434 fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
1435 fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
1436 GLuint index = _mesa_add_state_reference(params,
1437 (gl_state_index *)tokens);
1438
1439 this->param_index[c->prog_data.nr_params] = index;
1440 this->param_offset[c->prog_data.nr_params] = 0;
1441 c->prog_data.nr_params++;
1442 this->param_index[c->prog_data.nr_params] = index;
1443 this->param_offset[c->prog_data.nr_params] = 1;
1444 c->prog_data.nr_params++;
1445
1446 fs_reg dst = fs_reg(this, ir->coordinate->type);
1447 fs_reg src = coordinate;
1448 coordinate = dst;
1449
1450 emit(BRW_OPCODE_MUL, dst, src, scale_x);
1451 dst.reg_offset++;
1452 src.reg_offset++;
1453 emit(BRW_OPCODE_MUL, dst, src, scale_y);
1454 }
1455
1456 /* Writemasking doesn't eliminate channels on SIMD8 texture
1457 * samples, so don't worry about them.
1458 */
1459 fs_reg dst = fs_reg(this, glsl_type::vec4_type);
1460
1461 if (intel->gen < 5) {
1462 inst = emit_texture_gen4(ir, dst, coordinate);
1463 } else {
1464 inst = emit_texture_gen5(ir, dst, coordinate);
1465 }
1466
1467 /* If there's an offset, we already set up m1. To avoid the implied move,
1468 * use the null register. Otherwise, we want an implied move from g0.
1469 */
1470 if (ir->offset != NULL)
1471 inst->src[0] = fs_reg(brw_null_reg());
1472 else
1473 inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
1474
1475 inst->sampler = sampler;
1476
1477 this->result = dst;
1478
1479 if (ir->shadow_comparitor)
1480 inst->shadow_compare = true;
1481
1482 if (ir->type == glsl_type::float_type) {
1483 /* Ignore DEPTH_TEXTURE_MODE swizzling. */
1484 assert(ir->sampler->type->sampler_shadow);
1485 } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
1486 fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);
1487
1488 for (int i = 0; i < 4; i++) {
1489 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
1490 fs_reg l = swizzle_dst;
1491 l.reg_offset += i;
1492
1493 if (swiz == SWIZZLE_ZERO) {
1494 emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
1495 } else if (swiz == SWIZZLE_ONE) {
1496 emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
1497 } else {
1498 fs_reg r = dst;
1499 r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
1500 emit(BRW_OPCODE_MOV, l, r);
1501 }
1502 }
1503 this->result = swizzle_dst;
1504 }
1505 }
1506
1507 void
1508 fs_visitor::visit(ir_swizzle *ir)
1509 {
1510 ir->val->accept(this);
1511 fs_reg val = this->result;
1512
1513 if (ir->type->vector_elements == 1) {
1514 this->result.reg_offset += ir->mask.x;
1515 return;
1516 }
1517
1518 fs_reg result = fs_reg(this, ir->type);
1519 this->result = result;
1520
1521 for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
1522 fs_reg channel = val;
1523 int swiz = 0;
1524
1525 switch (i) {
1526 case 0:
1527 swiz = ir->mask.x;
1528 break;
1529 case 1:
1530 swiz = ir->mask.y;
1531 break;
1532 case 2:
1533 swiz = ir->mask.z;
1534 break;
1535 case 3:
1536 swiz = ir->mask.w;
1537 break;
1538 }
1539
1540 channel.reg_offset += swiz;
1541 emit(BRW_OPCODE_MOV, result, channel);
1542 result.reg_offset++;
1543 }
1544 }
1545
1546 void
1547 fs_visitor::visit(ir_discard *ir)
1548 {
1549 fs_reg temp = fs_reg(this, glsl_type::uint_type);
1550
1551 assert(ir->condition == NULL); /* FINISHME */
1552
1553 emit(FS_OPCODE_DISCARD_NOT, temp, reg_null_d);
1554 emit(FS_OPCODE_DISCARD_AND, reg_null_d, temp);
1555 kill_emitted = true;
1556 }
1557
1558 void
1559 fs_visitor::visit(ir_constant *ir)
1560 {
1561 /* Set this->result to reg at the bottom of the function because some code
1562 * paths will cause this visitor to be applied to other fields. This will
1563 * cause the value stored in this->result to be modified.
1564 *
1565 * Make reg constant so that it doesn't get accidentally modified along the
1566 * way. Yes, I actually had this problem. :(
1567 */
1568 const fs_reg reg(this, ir->type);
1569 fs_reg dst_reg = reg;
1570
1571 if (ir->type->is_array()) {
1572 const unsigned size = type_size(ir->type->fields.array);
1573
1574 for (unsigned i = 0; i < ir->type->length; i++) {
1575 ir->array_elements[i]->accept(this);
1576 fs_reg src_reg = this->result;
1577
1578 dst_reg.type = src_reg.type;
1579 for (unsigned j = 0; j < size; j++) {
1580 emit(BRW_OPCODE_MOV, dst_reg, src_reg);
1581 src_reg.reg_offset++;
1582 dst_reg.reg_offset++;
1583 }
1584 }
1585 } else if (ir->type->is_record()) {
1586 foreach_list(node, &ir->components) {
1587 ir_instruction *const field = (ir_instruction *) node;
1588 const unsigned size = type_size(field->type);
1589
1590 field->accept(this);
1591 fs_reg src_reg = this->result;
1592
1593 dst_reg.type = src_reg.type;
1594 for (unsigned j = 0; j < size; j++) {
1595 emit(BRW_OPCODE_MOV, dst_reg, src_reg);
1596 src_reg.reg_offset++;
1597 dst_reg.reg_offset++;
1598 }
1599 }
1600 } else {
1601 const unsigned size = type_size(ir->type);
1602
1603 for (unsigned i = 0; i < size; i++) {
1604 switch (ir->type->base_type) {
1605 case GLSL_TYPE_FLOAT:
1606 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
1607 break;
1608 case GLSL_TYPE_UINT:
1609 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
1610 break;
1611 case GLSL_TYPE_INT:
1612 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
1613 break;
1614 case GLSL_TYPE_BOOL:
1615 emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
1616 break;
1617 default:
1618 assert(!"Non-float/uint/int/bool constant");
1619 }
1620 dst_reg.reg_offset++;
1621 }
1622 }
1623
1624 this->result = reg;
1625 }
1626
1627 void
1628 fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
1629 {
1630 ir_expression *expr = ir->as_expression();
1631
1632 if (expr) {
1633 fs_reg op[2];
1634 fs_inst *inst;
1635
1636 assert(expr->get_num_operands() <= 2);
1637 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1638 assert(expr->operands[i]->type->is_scalar());
1639
1640 expr->operands[i]->accept(this);
1641 op[i] = this->result;
1642 }
1643
1644 switch (expr->operation) {
1645 case ir_unop_logic_not:
1646 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
1647 inst->conditional_mod = BRW_CONDITIONAL_Z;
1648 break;
1649
1650 case ir_binop_logic_xor:
1651 inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]);
1652 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1653 break;
1654
1655 case ir_binop_logic_or:
1656 inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]);
1657 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1658 break;
1659
1660 case ir_binop_logic_and:
1661 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]);
1662 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1663 break;
1664
1665 case ir_unop_f2b:
1666 if (intel->gen >= 6) {
1667 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
1668 } else {
1669 inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
1670 }
1671 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1672 break;
1673
1674 case ir_unop_i2b:
1675 if (intel->gen >= 6) {
1676 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
1677 } else {
1678 inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
1679 }
1680 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1681 break;
1682
1683 case ir_binop_greater:
1684 case ir_binop_gequal:
1685 case ir_binop_less:
1686 case ir_binop_lequal:
1687 case ir_binop_equal:
1688 case ir_binop_all_equal:
1689 case ir_binop_nequal:
1690 case ir_binop_any_nequal:
1691 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
1692 inst->conditional_mod =
1693 brw_conditional_for_comparison(expr->operation);
1694 break;
1695
1696 default:
1697 assert(!"not reached");
1698 fail("bad cond code\n");
1699 break;
1700 }
1701 return;
1702 }
1703
1704 ir->accept(this);
1705
1706 if (intel->gen >= 6) {
1707 fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
1708 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1709 } else {
1710 fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result);
1711 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1712 }
1713 }
1714
1715 /**
1716 * Emit a gen6 IF statement with the comparison folded into the IF
1717 * instruction.
1718 */
1719 void
1720 fs_visitor::emit_if_gen6(ir_if *ir)
1721 {
1722 ir_expression *expr = ir->condition->as_expression();
1723
1724 if (expr) {
1725 fs_reg op[2];
1726 fs_inst *inst;
1727 fs_reg temp;
1728
1729 assert(expr->get_num_operands() <= 2);
1730 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1731 assert(expr->operands[i]->type->is_scalar());
1732
1733 expr->operands[i]->accept(this);
1734 op[i] = this->result;
1735 }
1736
1737 switch (expr->operation) {
1738 case ir_unop_logic_not:
1739 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1740 inst->conditional_mod = BRW_CONDITIONAL_Z;
1741 return;
1742
1743 case ir_binop_logic_xor:
1744 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
1745 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1746 return;
1747
1748 case ir_binop_logic_or:
1749 temp = fs_reg(this, glsl_type::bool_type);
1750 emit(BRW_OPCODE_OR, temp, op[0], op[1]);
1751 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
1752 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1753 return;
1754
1755 case ir_binop_logic_and:
1756 temp = fs_reg(this, glsl_type::bool_type);
1757 emit(BRW_OPCODE_AND, temp, op[0], op[1]);
1758 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
1759 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1760 return;
1761
1762 case ir_unop_f2b:
1763 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
1764 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1765 return;
1766
1767 case ir_unop_i2b:
1768 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1769 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1770 return;
1771
1772 case ir_binop_greater:
1773 case ir_binop_gequal:
1774 case ir_binop_less:
1775 case ir_binop_lequal:
1776 case ir_binop_equal:
1777 case ir_binop_all_equal:
1778 case ir_binop_nequal:
1779 case ir_binop_any_nequal:
1780 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
1781 inst->conditional_mod =
1782 brw_conditional_for_comparison(expr->operation);
1783 return;
1784 default:
1785 assert(!"not reached");
1786 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1787 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1788 fail("bad condition\n");
1789 return;
1790 }
1791 return;
1792 }
1793
1794 ir->condition->accept(this);
1795
1796 fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0));
1797 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1798 }
1799
1800 void
1801 fs_visitor::visit(ir_if *ir)
1802 {
1803 fs_inst *inst;
1804
1805 if (c->dispatch_width == 16) {
1806 fail("Can't support (non-uniform) control flow on 16-wide\n");
1807 }
1808
1809 /* Don't point the annotation at the if statement, because then it plus
1810 * the then and else blocks get printed.
1811 */
1812 this->base_ir = ir->condition;
1813
1814 if (intel->gen >= 6) {
1815 emit_if_gen6(ir);
1816 } else {
1817 emit_bool_to_cond_code(ir->condition);
1818
1819 inst = emit(BRW_OPCODE_IF);
1820 inst->predicated = true;
1821 }
1822
1823 foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
1824 ir_instruction *ir = (ir_instruction *)iter.get();
1825 this->base_ir = ir;
1826
1827 ir->accept(this);
1828 }
1829
1830 if (!ir->else_instructions.is_empty()) {
1831 emit(BRW_OPCODE_ELSE);
1832
1833 foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
1834 ir_instruction *ir = (ir_instruction *)iter.get();
1835 this->base_ir = ir;
1836
1837 ir->accept(this);
1838 }
1839 }
1840
1841 emit(BRW_OPCODE_ENDIF);
1842 }
1843
1844 void
1845 fs_visitor::visit(ir_loop *ir)
1846 {
1847 fs_reg counter = reg_undef;
1848
1849 if (c->dispatch_width == 16) {
1850 fail("Can't support (non-uniform) control flow on 16-wide\n");
1851 }
1852
1853 if (ir->counter) {
1854 this->base_ir = ir->counter;
1855 ir->counter->accept(this);
1856 counter = *(variable_storage(ir->counter));
1857
1858 if (ir->from) {
1859 this->base_ir = ir->from;
1860 ir->from->accept(this);
1861
1862 emit(BRW_OPCODE_MOV, counter, this->result);
1863 }
1864 }
1865
1866 emit(BRW_OPCODE_DO);
1867
1868 if (ir->to) {
1869 this->base_ir = ir->to;
1870 ir->to->accept(this);
1871
1872 fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
1873 inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
1874
1875 inst = emit(BRW_OPCODE_BREAK);
1876 inst->predicated = true;
1877 }
1878
1879 foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
1880 ir_instruction *ir = (ir_instruction *)iter.get();
1881
1882 this->base_ir = ir;
1883 ir->accept(this);
1884 }
1885
1886 if (ir->increment) {
1887 this->base_ir = ir->increment;
1888 ir->increment->accept(this);
1889 emit(BRW_OPCODE_ADD, counter, counter, this->result);
1890 }
1891
1892 emit(BRW_OPCODE_WHILE);
1893 }
1894
1895 void
1896 fs_visitor::visit(ir_loop_jump *ir)
1897 {
1898 switch (ir->mode) {
1899 case ir_loop_jump::jump_break:
1900 emit(BRW_OPCODE_BREAK);
1901 break;
1902 case ir_loop_jump::jump_continue:
1903 emit(BRW_OPCODE_CONTINUE);
1904 break;
1905 }
1906 }
1907
1908 void
1909 fs_visitor::visit(ir_call *ir)
1910 {
1911 assert(!"FINISHME");
1912 }
1913
1914 void
1915 fs_visitor::visit(ir_return *ir)
1916 {
1917 assert(!"FINISHME");
1918 }
1919
1920 void
1921 fs_visitor::visit(ir_function *ir)
1922 {
1923 /* Ignore function bodies other than main() -- we shouldn't see calls to
1924 * them since they should all be inlined before we get to ir_to_mesa.
1925 */
1926 if (strcmp(ir->name, "main") == 0) {
1927 const ir_function_signature *sig;
1928 exec_list empty;
1929
1930 sig = ir->matching_signature(&empty);
1931
1932 assert(sig);
1933
1934 foreach_iter(exec_list_iterator, iter, sig->body) {
1935 ir_instruction *ir = (ir_instruction *)iter.get();
1936 this->base_ir = ir;
1937
1938 ir->accept(this);
1939 }
1940 }
1941 }
1942
1943 void
1944 fs_visitor::visit(ir_function_signature *ir)
1945 {
1946 assert(!"not reached");
1947 (void)ir;
1948 }
1949
1950 fs_inst *
1951 fs_visitor::emit(fs_inst inst)
1952 {
1953 fs_inst *list_inst = new(mem_ctx) fs_inst;
1954 *list_inst = inst;
1955
1956 if (force_uncompressed_stack > 0)
1957 list_inst->force_uncompressed = true;
1958 else if (force_sechalf_stack > 0)
1959 list_inst->force_sechalf = true;
1960
1961 list_inst->annotation = this->current_annotation;
1962 list_inst->ir = this->base_ir;
1963
1964 this->instructions.push_tail(list_inst);
1965
1966 return list_inst;
1967 }
1968
1969 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1970 void
1971 fs_visitor::emit_dummy_fs()
1972 {
1973 /* Everyone's favorite color. */
1974 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f));
1975 emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f));
1976 emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f));
1977 emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f));
1978
1979 fs_inst *write;
1980 write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
1981 write->base_mrf = 0;
1982 }
1983
1984 /* The register location here is relative to the start of the URB
1985 * data. It will get adjusted to be a real location before
1986 * generate_code() time.
1987 */
1988 struct brw_reg
1989 fs_visitor::interp_reg(int location, int channel)
1990 {
1991 int regnr = urb_setup[location] * 2 + channel / 2;
1992 int stride = (channel & 1) * 4;
1993
1994 assert(urb_setup[location] != -1);
1995
1996 return brw_vec1_grf(regnr, stride);
1997 }
1998
1999 /** Emits the interpolation for the varying inputs. */
2000 void
2001 fs_visitor::emit_interpolation_setup_gen4()
2002 {
2003 this->current_annotation = "compute pixel centers";
2004 this->pixel_x = fs_reg(this, glsl_type::uint_type);
2005 this->pixel_y = fs_reg(this, glsl_type::uint_type);
2006 this->pixel_x.type = BRW_REGISTER_TYPE_UW;
2007 this->pixel_y.type = BRW_REGISTER_TYPE_UW;
2008
2009 emit(FS_OPCODE_PIXEL_X, this->pixel_x);
2010 emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
2011
2012 this->current_annotation = "compute pixel deltas from v0";
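/* A note on the branch below: PLN reads its two deltas from adjacent
 * registers, so when has_pln is set they are allocated as one vec2;
 * otherwise two independent floats suffice (the non-PLN path
 * presumably interpolates with separate LINE/MAC instructions).
 */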
2013 if (brw->has_pln) {
2014 this->delta_x = fs_reg(this, glsl_type::vec2_type);
2015 this->delta_y = this->delta_x;
2016 this->delta_y.reg_offset++;
2017 } else {
2018 this->delta_x = fs_reg(this, glsl_type::float_type);
2019 this->delta_y = fs_reg(this, glsl_type::float_type);
2020 }
2021 emit(BRW_OPCODE_ADD, this->delta_x,
2022 this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
2023 emit(BRW_OPCODE_ADD, this->delta_y,
2024 this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));
2025
2026 this->current_annotation = "compute pos.w and 1/pos.w";
2027 /* Compute wpos.w. It's always in our setup, since it's needed to
2028 * interpolate the other attributes.
2029 */
2030 this->wpos_w = fs_reg(this, glsl_type::float_type);
2031 emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
2032 interp_reg(FRAG_ATTRIB_WPOS, 3));
2033 /* Compute the pixel 1/W value from wpos.w. */
2034 this->pixel_w = fs_reg(this, glsl_type::float_type);
2035 emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
2036 this->current_annotation = NULL;
2037 }
2038
2039 /** Emits the interpolation for the varying inputs. */
2040 void
2041 fs_visitor::emit_interpolation_setup_gen6()
2042 {
2043 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
2044
2045 /* If the pixel centers end up used, the setup is the same as for gen4. */
2046 this->current_annotation = "compute pixel centers";
2047 fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
2048 fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
2049 int_pixel_x.type = BRW_REGISTER_TYPE_UW;
2050 int_pixel_y.type = BRW_REGISTER_TYPE_UW;
2051 emit(BRW_OPCODE_ADD,
2052 int_pixel_x,
2053 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
2054 fs_reg(brw_imm_v(0x10101010)));
2055 emit(BRW_OPCODE_ADD,
2056 int_pixel_y,
2057 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
2058 fs_reg(brw_imm_v(0x11001100)));
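   /* The brw_imm_v arguments above are vectors of eight 4-bit values, low
    * nibble first: 0x10101010 is <0,1,0,1,0,1,0,1> and 0x11001100 is
    * <0,0,1,1,0,0,1,1>, i.e. the per-pixel X/Y offsets within each 2x2
    * subspan that get added to the subspan origins read from g1.
    */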
2059
2060 /* As of gen6, we can no longer mix float and int sources. We have
2061 * to turn the integer pixel centers into floats for their actual
2062 * use.
2063 */
2064 this->pixel_x = fs_reg(this, glsl_type::float_type);
2065 this->pixel_y = fs_reg(this, glsl_type::float_type);
2066 emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
2067 emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);
2068
2069 this->current_annotation = "compute pos.w";
2070 this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
2071 this->wpos_w = fs_reg(this, glsl_type::float_type);
2072 emit_math(FS_OPCODE_RCP, this->wpos_w, this->pixel_w);
2073
2074 this->delta_x = fs_reg(brw_vec8_grf(2, 0));
2075 this->delta_y = fs_reg(brw_vec8_grf(3, 0));
2076
2077 this->current_annotation = NULL;
2078 }
2079
2080 void
2081 fs_visitor::emit_color_write(int index, int first_color_mrf, fs_reg color)
2082 {
2083 int reg_width = c->dispatch_width / 8;
2084
2085 if (c->dispatch_width == 8 || intel->gen == 6) {
2086 /* SIMD8 write looks like:
2087 * m + 0: r0
2088 * m + 1: r1
2089 * m + 2: g0
2090 * m + 3: g1
2091 *
2092 * gen6 SIMD16 DP write looks like:
2093 * m + 0: r0
2094 * m + 1: r1
2095 * m + 2: g0
2096 * m + 3: g1
2097 * m + 4: b0
2098 * m + 5: b1
2099 * m + 6: a0
2100 * m + 7: a1
2101 */
2102 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index * reg_width),
2103 color);
2104 } else {
2105 /* pre-gen6 SIMD16 single source DP write looks like:
2106 * m + 0: r0
2107 * m + 1: g0
2108 * m + 2: b0
2109 * m + 3: a0
2110 * m + 4: r1
2111 * m + 5: g1
2112 * m + 6: b1
2113 * m + 7: a1
2114 */
2115 if (brw->has_compr4) {
2116 /* By setting the high bit of the MRF register number, we
2117 * indicate that we want COMPR4 mode - instead of doing the
2118 * usual destination + 1 for the second half we get
2119 * destination + 4.
2120 */
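	 /* For example, with first_color_mrf = 2 and index = 1, the two
	  * halves of the compressed write land in m3 and m7 instead of
	  * m3 and m4.
	  */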
2121 emit(BRW_OPCODE_MOV,
2122 fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index), color);
2123 } else {
2124 push_force_uncompressed();
2125 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index), color);
2126 pop_force_uncompressed();
2127
2128 push_force_sechalf();
2129 color.sechalf = true;
2130 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4), color);
2131 pop_force_sechalf();
2132 color.sechalf = false;
2133 }
2134 }
2135 }
2136
2137 void
2138 fs_visitor::emit_fb_writes()
2139 {
2140 this->current_annotation = "FB write header";
2141 GLboolean header_present = GL_TRUE;
2142 int nr = 0;
2143 int reg_width = c->dispatch_width / 8;
2144
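   /* On gen6+ the header can be skipped when there is a single render
    * target and no discard has modified the pixel mask the header carries;
    * earlier gens always send the m0/m1 header.
    */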
2145 if (intel->gen >= 6 &&
2146 !this->kill_emitted &&
2147 c->key.nr_color_regions == 1) {
2148 header_present = false;
2149 }
2150
2151 if (header_present) {
2152 /* m0, m1 header */
2153 nr += 2;
2154 }
2155
2156 if (c->aa_dest_stencil_reg) {
2157 push_force_uncompressed();
2158 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2159 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
2160 pop_force_uncompressed();
2161 }
2162
2163 /* Reserve space for color. It'll be filled in per MRT below. */
2164 int color_mrf = nr;
2165 nr += 4 * reg_width;
2166
2167 if (c->source_depth_to_render_target) {
2168 if (intel->gen == 6 && c->dispatch_width == 16) {
2169 /* For outputting oDepth on gen6, SIMD8 writes have to be
2170 * used. This would require 8-wide moves of each half to
2171 * message regs, kind of like pre-gen5 SIMD16 FB writes.
2172 * Just bail on doing so for now.
2173 */
2174 fail("Missing support for simd16 depth writes on gen6\n");
2175 }
2176
2177 if (c->computes_depth) {
2178 /* Hand over gl_FragDepth. */
2179 assert(this->frag_depth);
2180 fs_reg depth = *(variable_storage(this->frag_depth));
2181
2182 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
2183 } else {
2184 /* Pass through the payload depth. */
2185 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2186 fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
2187 }
2188 nr += reg_width;
2189 }
2190
2191 if (c->dest_depth_reg) {
2192 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2193 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
2194 nr += reg_width;
2195 }
2196
2197 fs_reg color = reg_undef;
2198 if (this->frag_color)
2199 color = *(variable_storage(this->frag_color));
2200 else if (this->frag_data) {
2201 color = *(variable_storage(this->frag_data));
2202 color.type = BRW_REGISTER_TYPE_F;
2203 }
2204
2205 for (int target = 0; target < c->key.nr_color_regions; target++) {
2206 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2207 "FB write target %d",
2208 target);
2209 if (this->frag_color || this->frag_data) {
2210 for (int i = 0; i < 4; i++) {
2211 emit_color_write(i, color_mrf, color);
2212 color.reg_offset++;
2213 }
2214 }
2215
2216 if (this->frag_color)
2217 color.reg_offset -= 4;
2218
2219 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2220 inst->target = target;
2221 inst->base_mrf = 0;
2222 inst->mlen = nr;
2223 if (target == c->key.nr_color_regions - 1)
2224 inst->eot = true;
2225 inst->header_present = header_present;
2226 }
2227
2228 if (c->key.nr_color_regions == 0) {
2229 if (c->key.alpha_test && (this->frag_color || this->frag_data)) {
2230 /* If the alpha test is enabled but there's no color buffer,
2231 * we still need to send alpha out the pipeline to our null
2232 * renderbuffer.
2233 */
2234 color.reg_offset += 3;
2235 emit_color_write(3, color_mrf, color);
2236 }
2237
2238 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2239 inst->base_mrf = 0;
2240 inst->mlen = nr;
2241 inst->eot = true;
2242 inst->header_present = header_present;
2243 }
2244
2245 this->current_annotation = NULL;
2246 }
2247
2248 void
2249 fs_visitor::generate_fb_write(fs_inst *inst)
2250 {
2251 GLboolean eot = inst->eot;
2252 struct brw_reg implied_header;
2253
2254    /* The header is 2 regs; g0 and g1 are the contents. g0 is handled by
2255     * the implied move, so emit g1 here.
2256 */
2257 brw_push_insn_state(p);
2258 brw_set_mask_control(p, BRW_MASK_DISABLE);
2259 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2260
2261 if (inst->header_present) {
2262 if (intel->gen >= 6) {
2263 brw_MOV(p,
2264 brw_message_reg(inst->base_mrf),
2265 brw_vec8_grf(0, 0));
2266
2267 if (inst->target > 0) {
2268 /* Set the render target index for choosing BLEND_STATE. */
2269 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
2270 BRW_REGISTER_TYPE_UD),
2271 brw_imm_ud(inst->target));
2272 }
2273
2274 /* Clear viewport index, render target array index. */
2275 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0),
2276 BRW_REGISTER_TYPE_UD),
2277 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2278 brw_imm_ud(0xf7ff));
2279
2280 implied_header = brw_null_reg();
2281 } else {
2282 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
2283 }
2284
2285 brw_MOV(p,
2286 brw_message_reg(inst->base_mrf + 1),
2287 brw_vec8_grf(1, 0));
2288 } else {
2289 implied_header = brw_null_reg();
2290 }
2291
2292 brw_pop_insn_state(p);
2293
2294 brw_fb_WRITE(p,
2295 c->dispatch_width,
2296 inst->base_mrf,
2297 implied_header,
2298 inst->target,
2299 inst->mlen,
2300 0,
2301 eot,
2302 inst->header_present);
2303 }
2304
2305 /* Computes the integer pixel x,y values from the origin.
2306 *
2307 * This is the basis of gl_FragCoord computation, but is also used
2308 * pre-gen6 for computing the deltas from v0 for computing
2309 * interpolation.
2310 */
2311 void
2312 fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x)
2313 {
2314 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
2315 struct brw_reg src;
2316 struct brw_reg deltas;
2317
2318 if (is_x) {
2319 src = stride(suboffset(g1_uw, 4), 2, 4, 0);
2320 deltas = brw_imm_v(0x10101010);
2321 } else {
2322 src = stride(suboffset(g1_uw, 5), 2, 4, 0);
2323 deltas = brw_imm_v(0x11001100);
2324 }
2325
2326 if (c->dispatch_width == 16) {
2327 dst = vec16(dst);
2328 }
2329
2330 /* We do this 8 or 16-wide, but since the destination is UW we
2331 * don't do compression in the 16-wide case.
2332 */
2333 brw_push_insn_state(p);
2334 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2335 brw_ADD(p, dst, src, deltas);
2336 brw_pop_insn_state(p);
2337 }
2338
2339 void
2340 fs_visitor::generate_linterp(fs_inst *inst,
2341 struct brw_reg dst, struct brw_reg *src)
2342 {
2343 struct brw_reg delta_x = src[0];
2344 struct brw_reg delta_y = src[1];
2345 struct brw_reg interp = src[2];
2346
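   /* PLN reads the deltas as one contiguous register pair, and before gen6
    * that pair also has to start on an even register number; otherwise
    * fall back to the two-instruction LINE + MAC sequence.
    */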
2347 if (brw->has_pln &&
2348 delta_y.nr == delta_x.nr + 1 &&
2349 (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
2350 brw_PLN(p, dst, interp, delta_x);
2351 } else {
2352 brw_LINE(p, brw_null_reg(), interp, delta_x);
2353 brw_MAC(p, dst, suboffset(interp, 1), delta_y);
2354 }
2355 }
2356
2357 void
2358 fs_visitor::generate_math(fs_inst *inst,
2359 struct brw_reg dst, struct brw_reg *src)
2360 {
2361 int op;
2362
2363 switch (inst->opcode) {
2364 case FS_OPCODE_RCP:
2365 op = BRW_MATH_FUNCTION_INV;
2366 break;
2367 case FS_OPCODE_RSQ:
2368 op = BRW_MATH_FUNCTION_RSQ;
2369 break;
2370 case FS_OPCODE_SQRT:
2371 op = BRW_MATH_FUNCTION_SQRT;
2372 break;
2373 case FS_OPCODE_EXP2:
2374 op = BRW_MATH_FUNCTION_EXP;
2375 break;
2376 case FS_OPCODE_LOG2:
2377 op = BRW_MATH_FUNCTION_LOG;
2378 break;
2379 case FS_OPCODE_POW:
2380 op = BRW_MATH_FUNCTION_POW;
2381 break;
2382 case FS_OPCODE_SIN:
2383 op = BRW_MATH_FUNCTION_SIN;
2384 break;
2385 case FS_OPCODE_COS:
2386 op = BRW_MATH_FUNCTION_COS;
2387 break;
2388 default:
2389 assert(!"not reached: unknown math function");
2390 op = 0;
2391 break;
2392 }
2393
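   /* From gen6 on, math is an ordinary execution-unit instruction with GRF
    * sources (mlen == 0); earlier gens send the operands to the shared math
    * unit through message registers. Either way math can't run compressed,
    * so in 16-wide mode each 8-wide half is issued separately.
    */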
2394 if (intel->gen >= 6) {
2395 assert(inst->mlen == 0);
2396
2397 if (inst->opcode == FS_OPCODE_POW) {
2398 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2399 brw_math2(p, dst, op, src[0], src[1]);
2400
2401 if (c->dispatch_width == 16) {
2402 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
2403 brw_math2(p, sechalf(dst), op, sechalf(src[0]), sechalf(src[1]));
2404 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
2405 }
2406 } else {
2407 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2408 brw_math(p, dst,
2409 op,
2410 inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2411 BRW_MATH_SATURATE_NONE,
2412 0, src[0],
2413 BRW_MATH_DATA_VECTOR,
2414 BRW_MATH_PRECISION_FULL);
2415
2416 if (c->dispatch_width == 16) {
2417 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
2418 brw_math(p, sechalf(dst),
2419 op,
2420 inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2421 BRW_MATH_SATURATE_NONE,
2422 0, sechalf(src[0]),
2423 BRW_MATH_DATA_VECTOR,
2424 BRW_MATH_PRECISION_FULL);
2425 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
2426 }
2427 }
2428    } else /* gen <= 5 */ {
2429 assert(inst->mlen >= 1);
2430
2431 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2432 brw_math(p, dst,
2433 op,
2434 inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2435 BRW_MATH_SATURATE_NONE,
2436 inst->base_mrf, src[0],
2437 BRW_MATH_DATA_VECTOR,
2438 BRW_MATH_PRECISION_FULL);
2439
2440 if (c->dispatch_width == 16) {
2441 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
2442 brw_math(p, sechalf(dst),
2443 op,
2444 inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2445 BRW_MATH_SATURATE_NONE,
2446 inst->base_mrf + 1, sechalf(src[0]),
2447 BRW_MATH_DATA_VECTOR,
2448 BRW_MATH_PRECISION_FULL);
2449
2450 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
2451 }
2452 }
2453 }
2454
2455 void
2456 fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2457 {
2458 int msg_type = -1;
2459 int rlen = 4;
2460 uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
2461
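   /* The sampler response is one register per channel of the result in
    * SIMD8 (rlen 4) and two per channel in SIMD16 (rlen 8).
    */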
2462 if (c->dispatch_width == 16) {
2463 rlen = 8;
2464 dst = vec16(dst);
2465 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2466 }
2467
2468 if (intel->gen >= 5) {
2469 switch (inst->opcode) {
2470 case FS_OPCODE_TEX:
2471 if (inst->shadow_compare) {
2472 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
2473 } else {
2474 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
2475 }
2476 break;
2477 case FS_OPCODE_TXB:
2478 if (inst->shadow_compare) {
2479 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
2480 } else {
2481 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
2482 }
2483 break;
2484 case FS_OPCODE_TXL:
2485 if (inst->shadow_compare) {
2486 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
2487 } else {
2488 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
2489 }
2490 break;
2491 case FS_OPCODE_TXD:
2492 assert(!"TXD isn't supported on gen5+ yet.");
2493 break;
2494 }
2495 } else {
2496 switch (inst->opcode) {
2497 case FS_OPCODE_TEX:
2498       /* Note that G45 and older determine shadow compare and dispatch width
2499 * from message length for most messages.
2500 */
2501 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2502 if (inst->shadow_compare) {
2503 assert(inst->mlen == 6);
2504 } else {
2505 assert(inst->mlen <= 4);
2506 }
2507 break;
2508 case FS_OPCODE_TXB:
2509 if (inst->shadow_compare) {
2510 assert(inst->mlen == 6);
2511 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
2512 } else {
2513 assert(inst->mlen == 9);
2514 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2515 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2516 }
2517 break;
2518 case FS_OPCODE_TXL:
2519 if (inst->shadow_compare) {
2520 assert(inst->mlen == 6);
2521 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
2522 } else {
2523 assert(inst->mlen == 9);
2524 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
2525 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2526 }
2527 break;
2528 case FS_OPCODE_TXD:
2529 assert(!"TXD isn't supported on gen4 yet.");
2530 break;
2531 }
2532 }
2533 assert(msg_type != -1);
2534
2535 brw_SAMPLE(p,
2536 retype(dst, BRW_REGISTER_TYPE_UW),
2537 inst->base_mrf,
2538 src,
2539 SURF_INDEX_TEXTURE(inst->sampler),
2540 inst->sampler,
2541 WRITEMASK_XYZW,
2542 msg_type,
2543 rlen,
2544 inst->mlen,
2545 0,
2546 1,
2547 simd_mode);
2548 }
2549
2550
2551 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
2552 * looking like:
2553 *
2554 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
2555 *
2556 * and we're trying to produce:
2557 *
2558 * DDX DDY
2559 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
2560 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
2561 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
2562 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
2563 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
2564 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
2565 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
2566 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
2567 *
2568 * and add another set of two more subspans if in 16-pixel dispatch mode.
2569 *
2570 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
2571 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
2572 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
2573 * between each other. We could probably do it like ddx and swizzle the right
2574 * order later, but bail for now and just produce
2575 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
2576 */
2577 void
2578 fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2579 {
2580 struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
2581 BRW_REGISTER_TYPE_F,
2582 BRW_VERTICAL_STRIDE_2,
2583 BRW_WIDTH_2,
2584 BRW_HORIZONTAL_STRIDE_0,
2585 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2586 struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
2587 BRW_REGISTER_TYPE_F,
2588 BRW_VERTICAL_STRIDE_2,
2589 BRW_WIDTH_2,
2590 BRW_HORIZONTAL_STRIDE_0,
2591 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2592 brw_ADD(p, dst, src0, negate(src1));
2593 }
2594
2595 void
2596 fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2597 {
2598 struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
2599 BRW_REGISTER_TYPE_F,
2600 BRW_VERTICAL_STRIDE_4,
2601 BRW_WIDTH_4,
2602 BRW_HORIZONTAL_STRIDE_0,
2603 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2604 struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
2605 BRW_REGISTER_TYPE_F,
2606 BRW_VERTICAL_STRIDE_4,
2607 BRW_WIDTH_4,
2608 BRW_HORIZONTAL_STRIDE_0,
2609 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2610 brw_ADD(p, dst, src0, negate(src1));
2611 }
2612
2613 void
2614 fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
2615 {
2616 if (intel->gen >= 6) {
2617 /* Gen6 no longer has the mask reg for us to just read the
2618 * active channels from. However, cmp updates just the channels
2619 * of the flag reg that are enabled, so we can get at the
2620 * channel enables that way. In this step, make a reg of ones
2621 * we'll compare to.
2622 */
2623 brw_MOV(p, mask, brw_imm_ud(1));
2624 } else {
2625 brw_push_insn_state(p);
2626 brw_set_mask_control(p, BRW_MASK_DISABLE);
2627 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2628 brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
2629 brw_pop_insn_state(p);
2630 }
2631 }
2632
2633 void
2634 fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
2635 {
2636 if (intel->gen >= 6) {
2637 struct brw_reg f0 = brw_flag_reg();
2638 struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
2639
2640 brw_push_insn_state(p);
2641 brw_set_mask_control(p, BRW_MASK_DISABLE);
2642 brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */
2643 brw_pop_insn_state(p);
2644
2645 brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
2646 BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */
2647       /* Undo CMP's whacking of predication. */
2648 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2649
2650 brw_push_insn_state(p);
2651 brw_set_mask_control(p, BRW_MASK_DISABLE);
2652 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2653 brw_AND(p, g1, f0, g1);
2654 brw_pop_insn_state(p);
2655 } else {
2656 struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
2657
2658 mask = brw_uw1_reg(mask.file, mask.nr, 0);
2659
2660 brw_push_insn_state(p);
2661 brw_set_mask_control(p, BRW_MASK_DISABLE);
2662 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2663 brw_AND(p, g0, mask, g0);
2664 brw_pop_insn_state(p);
2665 }
2666 }
2667
2668 void
2669 fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
2670 {
2671 assert(inst->mlen != 0);
2672
2673 brw_MOV(p,
2674 retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
2675 retype(src, BRW_REGISTER_TYPE_UD));
2676 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
2677 inst->offset);
2678 }
2679
2680 void
2681 fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
2682 {
2683 assert(inst->mlen != 0);
2684
2685 /* Clear any post destination dependencies that would be ignored by
2686 * the block read. See the B-Spec for pre-gen5 send instruction.
2687 *
2688 * This could use a better solution, since texture sampling and
2689 * math reads could potentially run into it as well -- anywhere
2690 * that we have a SEND with a destination that is a register that
2691 * was written but not read within the last N instructions (what's
2692 * N? unsure). This is rare because of dead code elimination, but
2693 * not impossible.
2694 */
2695 if (intel->gen == 4 && !intel->is_g4x)
2696 brw_MOV(p, brw_null_reg(), dst);
2697
2698 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
2699 inst->offset);
2700
2701 if (intel->gen == 4 && !intel->is_g4x) {
2702 /* gen4 errata: destination from a send can't be used as a
2703 * destination until it's been read. Just read it so we don't
2704 * have to worry.
2705 */
2706 brw_MOV(p, brw_null_reg(), dst);
2707 }
2708 }
2709
2710
2711 void
2712 fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
2713 {
2714 assert(inst->mlen != 0);
2715
2716 /* Clear any post destination dependencies that would be ignored by
2717 * the block read. See the B-Spec for pre-gen5 send instruction.
2718 *
2719 * This could use a better solution, since texture sampling and
2720 * math reads could potentially run into it as well -- anywhere
2721 * that we have a SEND with a destination that is a register that
2722 * was written but not read within the last N instructions (what's
2723 * N? unsure). This is rare because of dead code elimination, but
2724 * not impossible.
2725 */
2726 if (intel->gen == 4 && !intel->is_g4x)
2727 brw_MOV(p, brw_null_reg(), dst);
2728
2729 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
2730 inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
2731
2732 if (intel->gen == 4 && !intel->is_g4x) {
2733 /* gen4 errata: destination from a send can't be used as a
2734 * destination until it's been read. Just read it so we don't
2735 * have to worry.
2736 */
2737 brw_MOV(p, brw_null_reg(), dst);
2738 }
2739 }
2740
2741 /**
2742 * To be called after the last _mesa_add_state_reference() call, to
2743 * set up prog_data.param[] for assign_curb_setup() and
2744 * setup_pull_constants().
2745 */
2746 void
2747 fs_visitor::setup_paramvalues_refs()
2748 {
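   /* The 16-wide program shares prog_data with the 8-wide program, which
    * comes first in the compile and has already set up param[]; don't
    * redo it.
    */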
2749 if (c->dispatch_width != 8)
2750 return;
2751
2752 /* Set up the pointers to ParamValues now that that array is finalized. */
2753 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
2754 c->prog_data.param[i] =
2755 fp->Base.Parameters->ParameterValues[this->param_index[i]] +
2756 this->param_offset[i];
2757 }
2758 }
2759
2760 void
2761 fs_visitor::assign_curb_setup()
2762 {
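   /* Push constants are packed eight floats to a CURBE register, so round
    * the param count up to a whole number of registers.
    */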
2763 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2764 if (c->dispatch_width == 8) {
2765 c->prog_data.first_curbe_grf = c->nr_payload_regs;
2766 } else {
2767 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
2768 }
2769
2770 /* Map the offsets in the UNIFORM file to fixed HW regs. */
2771 foreach_iter(exec_list_iterator, iter, this->instructions) {
2772 fs_inst *inst = (fs_inst *)iter.get();
2773
2774 for (unsigned int i = 0; i < 3; i++) {
2775 if (inst->src[i].file == UNIFORM) {
2776 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2777 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
2778 constant_nr / 8,
2779 constant_nr % 8);
2780
2781 inst->src[i].file = FIXED_HW_REG;
2782 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
2783 }
2784 }
2785 }
2786 }
2787
2788 void
2789 fs_visitor::calculate_urb_setup()
2790 {
2791 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2792 urb_setup[i] = -1;
2793 }
2794
2795 int urb_next = 0;
2796 /* Figure out where each of the incoming setup attributes lands. */
2797 if (intel->gen >= 6) {
2798 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2799 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2800 urb_setup[i] = urb_next++;
2801 }
2802 }
2803 } else {
2804 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2805 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2806 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2807 int fp_index;
2808
2809 if (i >= VERT_RESULT_VAR0)
2810 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2811 else if (i <= VERT_RESULT_TEX7)
2812 fp_index = i;
2813 else
2814 fp_index = -1;
2815
2816 if (fp_index >= 0)
2817 urb_setup[fp_index] = urb_next++;
2818 }
2819 }
2820 }
2821
2822 /* Each attribute is 4 setup channels, each of which is half a reg. */
2823 c->prog_data.urb_read_length = urb_next * 2;
2824 }
2825
2826 void
2827 fs_visitor::assign_urb_setup()
2828 {
2829 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
2830
2831    /* Offset all the urb_setup[] indices by the actual position of the
2832 * setup regs, now that the location of the constants has been chosen.
2833 */
2834 foreach_iter(exec_list_iterator, iter, this->instructions) {
2835 fs_inst *inst = (fs_inst *)iter.get();
2836
2837 if (inst->opcode == FS_OPCODE_LINTERP) {
2838 assert(inst->src[2].file == FIXED_HW_REG);
2839 inst->src[2].fixed_hw_reg.nr += urb_start;
2840 }
2841
2842 if (inst->opcode == FS_OPCODE_CINTERP) {
2843 assert(inst->src[0].file == FIXED_HW_REG);
2844 inst->src[0].fixed_hw_reg.nr += urb_start;
2845 }
2846 }
2847
2848 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2849 }
2850
2851 /**
2852 * Split large virtual GRFs into separate components if we can.
2853 *
2854 * This is mostly duplicated with what brw_fs_vector_splitting does,
2855 * but that's really conservative because it's afraid of doing
2856 * splitting that doesn't result in real progress after the rest of
2857 * the optimization phases, which would cause infinite looping in
2858 * optimization. We can do it once here, safely. This also has the
2859 * opportunity to split interpolated values, or maybe even uniforms,
2860 * which we don't have at the IR level.
2861 *
2862 * We want to split, because virtual GRFs are what we register
2863 * allocate and spill (due to contiguousness requirements for some
2864 * instructions), and they're what we naturally generate in the
2865 * codegen process, but most virtual GRFs don't actually need to be
2866 * contiguous sets of GRFs. If we split, we'll end up with reduced
2867 * live intervals and better dead code elimination and coalescing.
2868 */
2869 void
2870 fs_visitor::split_virtual_grfs()
2871 {
2872 int num_vars = this->virtual_grf_next;
2873 bool split_grf[num_vars];
2874 int new_virtual_grf[num_vars];
2875
2876    /* Try to split anything larger than one register. */
2877 for (int i = 0; i < num_vars; i++) {
2878 if (this->virtual_grf_sizes[i] != 1)
2879 split_grf[i] = true;
2880 else
2881 split_grf[i] = false;
2882 }
2883
2884 if (brw->has_pln) {
2885 /* PLN opcodes rely on the delta_xy being contiguous. */
2886 split_grf[this->delta_x.reg] = false;
2887 }
2888
2889 foreach_iter(exec_list_iterator, iter, this->instructions) {
2890 fs_inst *inst = (fs_inst *)iter.get();
2891
2892 /* Texturing produces 4 contiguous registers, so no splitting. */
2893 if (inst->is_tex()) {
2894 split_grf[inst->dst.reg] = false;
2895 }
2896 }
2897
2898 /* Allocate new space for split regs. Note that the virtual
2899 * numbers will be contiguous.
2900 */
2901 for (int i = 0; i < num_vars; i++) {
2902 if (split_grf[i]) {
2903 new_virtual_grf[i] = virtual_grf_alloc(1);
2904 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2905 int reg = virtual_grf_alloc(1);
2906 assert(reg == new_virtual_grf[i] + j - 1);
2907 (void) reg;
2908 }
2909 this->virtual_grf_sizes[i] = 1;
2910 }
2911 }
2912
2913 foreach_iter(exec_list_iterator, iter, this->instructions) {
2914 fs_inst *inst = (fs_inst *)iter.get();
2915
2916 if (inst->dst.file == GRF &&
2917 split_grf[inst->dst.reg] &&
2918 inst->dst.reg_offset != 0) {
2919 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2920 inst->dst.reg_offset - 1);
2921 inst->dst.reg_offset = 0;
2922 }
2923 for (int i = 0; i < 3; i++) {
2924 if (inst->src[i].file == GRF &&
2925 split_grf[inst->src[i].reg] &&
2926 inst->src[i].reg_offset != 0) {
2927 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2928 inst->src[i].reg_offset - 1);
2929 inst->src[i].reg_offset = 0;
2930 }
2931 }
2932 }
2933 this->live_intervals_valid = false;
2934 }
2935
2936 /**
2937 * Choose accesses from the UNIFORM file to demote to using the pull
2938 * constant buffer.
2939 *
2940  * We allow a fragment shader to have more than the GL-specified minimum
2941  * for the maximum number of fragment shader uniform components (64). If
2942 * there are too many of these, they'd fill up all of register space.
2943 * So, this will push some of them out to the pull constant buffer and
2944 * update the program to load them.
2945 */
2946 void
2947 fs_visitor::setup_pull_constants()
2948 {
2949 /* Only allow 16 registers (128 uniform components) as push constants. */
2950 unsigned int max_uniform_components = 16 * 8;
2951 if (c->prog_data.nr_params <= max_uniform_components)
2952 return;
2953
2954 if (c->dispatch_width == 16) {
2955 fail("Pull constants not supported in 16-wide\n");
2956 return;
2957 }
2958
2959 /* Just demote the end of the list. We could probably do better
2960 * here, demoting things that are rarely used in the program first.
2961 */
2962 int pull_uniform_base = max_uniform_components;
2963 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
2964
2965 foreach_iter(exec_list_iterator, iter, this->instructions) {
2966 fs_inst *inst = (fs_inst *)iter.get();
2967
2968 for (int i = 0; i < 3; i++) {
2969 if (inst->src[i].file != UNIFORM)
2970 continue;
2971
2972 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2973 if (uniform_nr < pull_uniform_base)
2974 continue;
2975
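	 /* Demote this use to a pull-constant load. The oword block read
	  * fetches an aligned 16-byte block, so the byte offset is rounded
	  * down to 16 and smear picks the wanted component out of the vec4.
	  */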
2976 fs_reg dst = fs_reg(this, glsl_type::float_type);
2977 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
2978 dst);
2979 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
2980 pull->ir = inst->ir;
2981 pull->annotation = inst->annotation;
2982 pull->base_mrf = 14;
2983 pull->mlen = 1;
2984
2985 inst->insert_before(pull);
2986
2987 inst->src[i].file = GRF;
2988 inst->src[i].reg = dst.reg;
2989 inst->src[i].reg_offset = 0;
2990 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
2991 }
2992 }
2993
2994 for (int i = 0; i < pull_uniform_count; i++) {
2995 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
2996 c->prog_data.pull_param_convert[i] =
2997 c->prog_data.param_convert[pull_uniform_base + i];
2998 }
2999 c->prog_data.nr_params -= pull_uniform_count;
3000 c->prog_data.nr_pull_params = pull_uniform_count;
3001 }
3002
3003 void
3004 fs_visitor::calculate_live_intervals()
3005 {
3006 int num_vars = this->virtual_grf_next;
3007 int *def = ralloc_array(mem_ctx, int, num_vars);
3008 int *use = ralloc_array(mem_ctx, int, num_vars);
3009 int loop_depth = 0;
3010 int loop_start = 0;
3011 int bb_header_ip = 0;
3012
3013 if (this->live_intervals_valid)
3014 return;
3015
3016 for (int i = 0; i < num_vars; i++) {
3017 def[i] = MAX_INSTRUCTION;
3018 use[i] = -1;
3019 }
3020
3021 int ip = 0;
3022 foreach_iter(exec_list_iterator, iter, this->instructions) {
3023 fs_inst *inst = (fs_inst *)iter.get();
3024
3025 if (inst->opcode == BRW_OPCODE_DO) {
3026 if (loop_depth++ == 0)
3027 loop_start = ip;
3028 } else if (inst->opcode == BRW_OPCODE_WHILE) {
3029 loop_depth--;
3030
3031 if (loop_depth == 0) {
3032 	 /* Patches up the uses of vars marked as live across
3033 * the whole loop.
3034 */
3035 for (int i = 0; i < num_vars; i++) {
3036 if (use[i] == loop_start) {
3037 use[i] = ip;
3038 }
3039 }
3040 }
3041 } else {
3042 for (unsigned int i = 0; i < 3; i++) {
3043 if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
3044 int reg = inst->src[i].reg;
3045
3046 if (!loop_depth) {
3047 use[reg] = ip;
3048 } else {
3049 def[reg] = MIN2(loop_start, def[reg]);
3050 use[reg] = loop_start;
3051
3052 /* Nobody else is going to go smash our start to
3053 * later in the loop now, because def[reg] now
3054 * points before the bb header.
3055 */
3056 }
3057 }
3058 }
3059 if (inst->dst.file == GRF && inst->dst.reg != 0) {
3060 int reg = inst->dst.reg;
3061
3062 if (!loop_depth) {
3063 def[reg] = MIN2(def[reg], ip);
3064 } else {
3065 def[reg] = MIN2(def[reg], loop_start);
3066 }
3067 }
3068 }
3069
3070 ip++;
3071
3072 /* Set the basic block header IP. This is used for determining
3073        * if a complete def of a single-register virtual GRF in a loop
3074        * dominates a use in the same basic block.  It's a quick way to
3075        * reduce the live interval range of most registers used in a
3076 * loop.
3077 */
3078 if (inst->opcode == BRW_OPCODE_IF ||
3079 inst->opcode == BRW_OPCODE_ELSE ||
3080 inst->opcode == BRW_OPCODE_ENDIF ||
3081 inst->opcode == BRW_OPCODE_DO ||
3082 inst->opcode == BRW_OPCODE_WHILE ||
3083 inst->opcode == BRW_OPCODE_BREAK ||
3084 inst->opcode == BRW_OPCODE_CONTINUE) {
3085 bb_header_ip = ip;
3086 }
3087 }
3088
3089 ralloc_free(this->virtual_grf_def);
3090 ralloc_free(this->virtual_grf_use);
3091 this->virtual_grf_def = def;
3092 this->virtual_grf_use = use;
3093
3094 this->live_intervals_valid = true;
3095 }
3096
3097 /**
3098 * Attempts to move immediate constants into the immediate
3099 * constant slot of following instructions.
3100 *
3101 * Immediate constants are a bit tricky -- they have to be in the last
3102  * operand slot, and you can't do abs/negate on them.
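 * To make room, the pass commutes MUL/ADD operands, swaps CMP arguments
 * while flipping the conditional mod, and swaps SEL operands while
 * inverting the predicate.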
3103 */
3104
3105 bool
3106 fs_visitor::propagate_constants()
3107 {
3108 bool progress = false;
3109
3110 calculate_live_intervals();
3111
3112 foreach_iter(exec_list_iterator, iter, this->instructions) {
3113 fs_inst *inst = (fs_inst *)iter.get();
3114
3115 if (inst->opcode != BRW_OPCODE_MOV ||
3116 inst->predicated ||
3117 inst->dst.file != GRF || inst->src[0].file != IMM ||
3118 inst->dst.type != inst->src[0].type ||
3119 (c->dispatch_width == 16 &&
3120 (inst->force_uncompressed || inst->force_sechalf)))
3121 continue;
3122
3123 /* Don't bother with cases where we should have had the
3124 * operation on the constant folded in GLSL already.
3125 */
3126 if (inst->saturate)
3127 continue;
3128
3129 /* Found a move of a constant to a GRF. Find anything else using the GRF
3130 * before it's written, and replace it with the constant if we can.
3131 */
3132 exec_list_iterator scan_iter = iter;
3133 scan_iter.next();
3134 for (; scan_iter.has_next(); scan_iter.next()) {
3135 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3136
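	 /* Only propagate within the basic block; past control flow the MOV
	  * no longer dominates the instructions being rewritten.
	  */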
3137 if (scan_inst->opcode == BRW_OPCODE_DO ||
3138 scan_inst->opcode == BRW_OPCODE_WHILE ||
3139 scan_inst->opcode == BRW_OPCODE_ELSE ||
3140 scan_inst->opcode == BRW_OPCODE_ENDIF) {
3141 break;
3142 }
3143
3144 for (int i = 2; i >= 0; i--) {
3145 if (scan_inst->src[i].file != GRF ||
3146 scan_inst->src[i].reg != inst->dst.reg ||
3147 scan_inst->src[i].reg_offset != inst->dst.reg_offset)
3148 continue;
3149
3150 /* Don't bother with cases where we should have had the
3151 * operation on the constant folded in GLSL already.
3152 */
3153 if (scan_inst->src[i].negate || scan_inst->src[i].abs)
3154 continue;
3155
3156 switch (scan_inst->opcode) {
3157 case BRW_OPCODE_MOV:
3158 scan_inst->src[i] = inst->src[0];
3159 progress = true;
3160 break;
3161
3162 case BRW_OPCODE_MUL:
3163 case BRW_OPCODE_ADD:
3164 if (i == 1) {
3165 scan_inst->src[i] = inst->src[0];
3166 progress = true;
3167 } else if (i == 0 && scan_inst->src[1].file != IMM) {
3168 /* Fit this constant in by commuting the operands */
3169 scan_inst->src[0] = scan_inst->src[1];
3170 scan_inst->src[1] = inst->src[0];
3171 progress = true;
3172 }
3173 break;
3174
3175 case BRW_OPCODE_CMP:
3176 if (i == 1) {
3177 scan_inst->src[i] = inst->src[0];
3178 progress = true;
3179 } else if (i == 0 && scan_inst->src[1].file != IMM) {
3180 uint32_t new_cmod;
3181
3182 new_cmod = brw_swap_cmod(scan_inst->conditional_mod);
3183 if (new_cmod != ~0u) {
3184 /* Fit this constant in by swapping the operands and
3185 * flipping the test
3186 */
3187 scan_inst->src[0] = scan_inst->src[1];
3188 scan_inst->src[1] = inst->src[0];
3189 scan_inst->conditional_mod = new_cmod;
3190 progress = true;
3191 }
3192 }
3193 break;
3194
3195 case BRW_OPCODE_SEL:
3196 if (i == 1) {
3197 scan_inst->src[i] = inst->src[0];
3198 progress = true;
3199 } else if (i == 0 && scan_inst->src[1].file != IMM) {
3200 /* Fit this constant in by swapping the operands and
3201 * flipping the predicate
3202 */
3203 scan_inst->src[0] = scan_inst->src[1];
3204 scan_inst->src[1] = inst->src[0];
3205 scan_inst->predicate_inverse = !scan_inst->predicate_inverse;
3206 progress = true;
3207 }
3208 break;
3209 }
3210 }
3211
3212 if (scan_inst->dst.file == GRF &&
3213 scan_inst->dst.reg == inst->dst.reg &&
3214 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
3215 scan_inst->is_tex())) {
3216 break;
3217 }
3218 }
3219 }
3220
3221 if (progress)
3222 this->live_intervals_valid = false;
3223
3224 return progress;
3225 }
3226 /**
3227  * Must be called after calculate_live_intervals() to remove unused
3228  * writes to registers -- register allocation will fail otherwise
3229  * because something def'd but never used won't be considered to
3230  * interfere with other regs.
3231 */
3232 bool
3233 fs_visitor::dead_code_eliminate()
3234 {
3235 bool progress = false;
3236 int pc = 0;
3237
3238 calculate_live_intervals();
3239
3240 foreach_iter(exec_list_iterator, iter, this->instructions) {
3241 fs_inst *inst = (fs_inst *)iter.get();
3242
3243 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
3244 inst->remove();
3245 progress = true;
3246 }
3247
3248 pc++;
3249 }
3250
3251 if (progress)
3252 live_intervals_valid = false;
3253
3254 return progress;
3255 }
3256
3257 bool
3258 fs_visitor::register_coalesce()
3259 {
3260 bool progress = false;
3261 int if_depth = 0;
3262 int loop_depth = 0;
3263
3264 foreach_iter(exec_list_iterator, iter, this->instructions) {
3265 fs_inst *inst = (fs_inst *)iter.get();
3266
3267       /* Make sure that we dominate the instructions we're going to
3268        * scan for interference; if we don't, we won't have scanned far
3269        * enough to know whether the coalesce is safe.  We don't dominate
3270        * the following instructions if we're in a loop or an if block.
3272 */
3273 switch (inst->opcode) {
3274 case BRW_OPCODE_DO:
3275 loop_depth++;
3276 break;
3277 case BRW_OPCODE_WHILE:
3278 loop_depth--;
3279 break;
3280 case BRW_OPCODE_IF:
3281 if_depth++;
3282 break;
3283 case BRW_OPCODE_ENDIF:
3284 if_depth--;
3285 break;
3286 }
3287 if (loop_depth || if_depth)
3288 continue;
3289
3290 if (inst->opcode != BRW_OPCODE_MOV ||
3291 inst->predicated ||
3292 inst->saturate ||
3293 inst->dst.file != GRF || inst->src[0].file != GRF ||
3294 inst->dst.type != inst->src[0].type)
3295 continue;
3296
3297 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
3298
3299 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
3300 * them: check for no writes to either one until the exit of the
3301 * program.
3302 */
3303 bool interfered = false;
3304 exec_list_iterator scan_iter = iter;
3305 scan_iter.next();
3306 for (; scan_iter.has_next(); scan_iter.next()) {
3307 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3308
3309 if (scan_inst->dst.file == GRF) {
3310 if (scan_inst->dst.reg == inst->dst.reg &&
3311 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
3312 scan_inst->is_tex())) {
3313 interfered = true;
3314 break;
3315 }
3316 if (scan_inst->dst.reg == inst->src[0].reg &&
3317 (scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
3318 scan_inst->is_tex())) {
3319 interfered = true;
3320 break;
3321 }
3322 }
3323
3324 /* The gen6 MATH instruction can't handle source modifiers, so avoid
3325 * coalescing those for now. We should do something more specific.
3326 */
3327 if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) {
3328 interfered = true;
3329 break;
3330 }
3331 }
3332 if (interfered) {
3333 continue;
3334 }
3335
3336 /* Rewrite the later usage to point at the source of the move to
3337 * be removed.
3338 */
3339 for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
3340 scan_iter.next()) {
3341 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3342
3343 for (int i = 0; i < 3; i++) {
3344 if (scan_inst->src[i].file == GRF &&
3345 scan_inst->src[i].reg == inst->dst.reg &&
3346 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
3347 scan_inst->src[i].reg = inst->src[0].reg;
3348 scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
3349 scan_inst->src[i].abs |= inst->src[0].abs;
3350 scan_inst->src[i].negate ^= inst->src[0].negate;
3351 scan_inst->src[i].smear = inst->src[0].smear;
3352 }
3353 }
3354 }
3355
3356 inst->remove();
3357 progress = true;
3358 }
3359
3360 if (progress)
3361 live_intervals_valid = false;
3362
3363 return progress;
3364 }
3365
3366
3367 bool
3368 fs_visitor::compute_to_mrf()
3369 {
3370 bool progress = false;
3371 int next_ip = 0;
3372
3373 calculate_live_intervals();
3374
3375 foreach_iter(exec_list_iterator, iter, this->instructions) {
3376 fs_inst *inst = (fs_inst *)iter.get();
3377
3378 int ip = next_ip;
3379 next_ip++;
3380
3381 if (inst->opcode != BRW_OPCODE_MOV ||
3382 inst->predicated ||
3383 inst->dst.file != MRF || inst->src[0].file != GRF ||
3384 inst->dst.type != inst->src[0].type ||
3385 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
3386 continue;
3387
3388 /* Work out which hardware MRF registers are written by this
3389 * instruction.
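       * A COMPR4 write hits mrf_low and mrf_low + 4, and an ordinary
       * compressed 16-wide write hits mrf_low and mrf_low + 1.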
3390 */
3391 int mrf_low = inst->dst.hw_reg & ~BRW_MRF_COMPR4;
3392 int mrf_high;
3393 if (inst->dst.hw_reg & BRW_MRF_COMPR4) {
3394 mrf_high = mrf_low + 4;
3395 } else if (c->dispatch_width == 16 &&
3396 (!inst->force_uncompressed && !inst->force_sechalf)) {
3397 mrf_high = mrf_low + 1;
3398 } else {
3399 mrf_high = mrf_low;
3400 }
3401
3402 /* Can't compute-to-MRF this GRF if someone else was going to
3403 * read it later.
3404 */
3405 if (this->virtual_grf_use[inst->src[0].reg] > ip)
3406 continue;
3407
3408 /* Found a move of a GRF to a MRF. Let's see if we can go
3409 * rewrite the thing that made this GRF to write into the MRF.
3410 */
3411 fs_inst *scan_inst;
3412 for (scan_inst = (fs_inst *)inst->prev;
3413 scan_inst->prev != NULL;
3414 scan_inst = (fs_inst *)scan_inst->prev) {
3415 if (scan_inst->dst.file == GRF &&
3416 scan_inst->dst.reg == inst->src[0].reg) {
3417 /* Found the last thing to write our reg we want to turn
3418 * into a compute-to-MRF.
3419 */
3420
3421 if (scan_inst->is_tex()) {
3422 	    /* Texturing writes several contiguous regs, so we can't
3423 	     * compute-to-MRF that.
3424 */
3425 break;
3426 }
3427
3428 /* If it's predicated, it (probably) didn't populate all
3429 * the channels. We might be able to rewrite everything
3430 * that writes that reg, but it would require smarter
3431 * tracking to delay the rewriting until complete success.
3432 */
3433 if (scan_inst->predicated)
3434 break;
3435
3436 /* If it's half of register setup and not the same half as
3437 * our MOV we're trying to remove, bail for now.
3438 */
3439 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
3440 scan_inst->force_sechalf != inst->force_sechalf) {
3441 break;
3442 }
3443
3444 /* SEND instructions can't have MRF as a destination. */
3445 if (scan_inst->mlen)
3446 break;
3447
3448 if (intel->gen >= 6) {
3449 /* gen6 math instructions must have the destination be
3450 * GRF, so no compute-to-MRF for them.
3451 */
3452 if (scan_inst->is_math()) {
3453 break;
3454 }
3455 }
3456
3457 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
3458 /* Found the creator of our MRF's source value. */
3459 scan_inst->dst.file = MRF;
3460 scan_inst->dst.hw_reg = inst->dst.hw_reg;
3461 scan_inst->saturate |= inst->saturate;
3462 inst->remove();
3463 progress = true;
3464 }
3465 break;
3466 }
3467
3468 /* We don't handle flow control here. Most computation of
3469        * values that end up in MRFs happens shortly before the MRF
3470 * write anyway.
3471 */
3472 if (scan_inst->opcode == BRW_OPCODE_DO ||
3473 scan_inst->opcode == BRW_OPCODE_WHILE ||
3474 scan_inst->opcode == BRW_OPCODE_ELSE ||
3475 scan_inst->opcode == BRW_OPCODE_ENDIF) {
3476 break;
3477 }
3478
3479 /* You can't read from an MRF, so if someone else reads our
3480 * MRF's source GRF that we wanted to rewrite, that stops us.
3481 */
3482 bool interfered = false;
3483 for (int i = 0; i < 3; i++) {
3484 if (scan_inst->src[i].file == GRF &&
3485 scan_inst->src[i].reg == inst->src[0].reg &&
3486 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
3487 interfered = true;
3488 }
3489 }
3490 if (interfered)
3491 break;
3492
3493 if (scan_inst->dst.file == MRF) {
3494 /* If somebody else writes our MRF here, we can't
3495 * compute-to-MRF before that.
3496 */
3497 int scan_mrf_low = scan_inst->dst.hw_reg & ~BRW_MRF_COMPR4;
3498 int scan_mrf_high;
3499
3500 if (scan_inst->dst.hw_reg & BRW_MRF_COMPR4) {
3501 scan_mrf_high = scan_mrf_low + 4;
3502 } else if (c->dispatch_width == 16 &&
3503 (!scan_inst->force_uncompressed &&
3504 !scan_inst->force_sechalf)) {
3505 scan_mrf_high = scan_mrf_low + 1;
3506 } else {
3507 scan_mrf_high = scan_mrf_low;
3508 }
3509
3510 if (mrf_low == scan_mrf_low ||
3511 mrf_low == scan_mrf_high ||
3512 mrf_high == scan_mrf_low ||
3513 mrf_high == scan_mrf_high) {
3514 break;
3515 }
3516 }
3517
3518 if (scan_inst->mlen > 0) {
3519 /* Found a SEND instruction, which means that there are
3520 * live values in MRFs from base_mrf to base_mrf +
3521 * scan_inst->mlen - 1. Don't go pushing our MRF write up
3522 * above it.
3523 */
3524 if (mrf_low >= scan_inst->base_mrf &&
3525 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
3526 break;
3527 }
3528 if (mrf_high >= scan_inst->base_mrf &&
3529 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
3530 break;
3531 }
3532 }
3533 }
3534 }
3535
3536 return progress;
3537 }
3538
3539 /**
3540  * Walks through basic blocks, looking for repeated MRF writes and
3541 * removing the later ones.
3542 */
3543 bool
3544 fs_visitor::remove_duplicate_mrf_writes()
3545 {
3546 fs_inst *last_mrf_move[16];
3547 bool progress = false;
3548
3549 /* Need to update the MRF tracking for compressed instructions. */
3550 if (c->dispatch_width == 16)
3551 return false;
3552
3553 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3554
3555 foreach_iter(exec_list_iterator, iter, this->instructions) {
3556 fs_inst *inst = (fs_inst *)iter.get();
3557
3558 switch (inst->opcode) {
3559 case BRW_OPCODE_DO:
3560 case BRW_OPCODE_WHILE:
3561 case BRW_OPCODE_IF:
3562 case BRW_OPCODE_ELSE:
3563 case BRW_OPCODE_ENDIF:
3564 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3565 continue;
3566 default:
3567 break;
3568 }
3569
3570 if (inst->opcode == BRW_OPCODE_MOV &&
3571 inst->dst.file == MRF) {
3572 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
3573 if (prev_inst && inst->equals(prev_inst)) {
3574 inst->remove();
3575 progress = true;
3576 continue;
3577 }
3578 }
3579
3580 /* Clear out the last-write records for MRFs that were overwritten. */
3581 if (inst->dst.file == MRF) {
3582 last_mrf_move[inst->dst.hw_reg] = NULL;
3583 }
3584
3585 if (inst->mlen > 0) {
3586 /* Found a SEND instruction, which will include two or fewer
3587 * implied MRF writes. We could do better here.
3588 */
3589 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3590 last_mrf_move[inst->base_mrf + i] = NULL;
3591 }
3592 }
3593
3594 /* Clear out any MRF move records whose sources got overwritten. */
3595 if (inst->dst.file == GRF) {
3596 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
3597 if (last_mrf_move[i] &&
3598 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3599 last_mrf_move[i] = NULL;
3600 }
3601 }
3602 }
3603
3604 if (inst->opcode == BRW_OPCODE_MOV &&
3605 inst->dst.file == MRF &&
3606 inst->src[0].file == GRF &&
3607 !inst->predicated) {
3608 last_mrf_move[inst->dst.hw_reg] = inst;
3609 }
3610 }
3611
3612 return progress;
3613 }
3614
3615 bool
3616 fs_visitor::virtual_grf_interferes(int a, int b)
3617 {
3618 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3619 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
3620
3621 /* We can't handle dead register writes here, without iterating
3622 * over the whole instruction stream to find every single dead
3623 * write to that register to compare to the live interval of the
3624 * other register. Just assert that dead_code_eliminate() has been
3625 * called.
3626 */
3627 assert((this->virtual_grf_use[a] != -1 ||
3628 this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
3629 (this->virtual_grf_use[b] != -1 ||
3630 this->virtual_grf_def[b] == MAX_INSTRUCTION));
3631
3632 /* If the register is used to store 16 values of less than float
3633 * size (only the case for pixel_[xy]), then we can't allocate
3634 * another dword-sized thing to that register that would be used in
3635 * the same instruction. This is because when the GPU decodes (for
3636 * example):
3637 *
3638 * (declare (in ) vec4 gl_FragCoord@0x97766a0)
3639 * add(16) g6<1>F g6<8,8,1>UW 0.5F { align1 compr };
3640 *
3641 * it's actually processed as:
3642 * add(8) g6<1>F g6<8,8,1>UW 0.5F { align1 };
3643 * add(8) g7<1>F g6.8<8,8,1>UW 0.5F { align1 sechalf };
3644 *
3645 * so our second half values in g6 got overwritten in the first
3646 * half.
3647 */
3648 if (c->dispatch_width == 16 && (this->pixel_x.reg == a ||
3649 this->pixel_x.reg == b ||
3650 this->pixel_y.reg == a ||
3651 this->pixel_y.reg == b)) {
3652 return start <= end;
3653 }
3654
3655 return start < end;
3656 }
3657
3658 static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
3659 {
3660 struct brw_reg brw_reg;
3661
3662 switch (reg->file) {
3663 case GRF:
3664 case ARF:
3665 case MRF:
3666 if (reg->smear == -1) {
3667 brw_reg = brw_vec8_reg(reg->file,
3668 reg->hw_reg, 0);
3669 } else {
3670 brw_reg = brw_vec1_reg(reg->file,
3671 reg->hw_reg, reg->smear);
3672 }
3673 brw_reg = retype(brw_reg, reg->type);
3674 if (reg->sechalf)
3675 brw_reg = sechalf(brw_reg);
3676 break;
3677 case IMM:
3678 switch (reg->type) {
3679 case BRW_REGISTER_TYPE_F:
3680 brw_reg = brw_imm_f(reg->imm.f);
3681 break;
3682 case BRW_REGISTER_TYPE_D:
3683 brw_reg = brw_imm_d(reg->imm.i);
3684 break;
3685 case BRW_REGISTER_TYPE_UD:
3686 brw_reg = brw_imm_ud(reg->imm.u);
3687 break;
3688 default:
3689 assert(!"not reached");
3690 brw_reg = brw_null_reg();
3691 break;
3692 }
3693 break;
3694 case FIXED_HW_REG:
3695 brw_reg = reg->fixed_hw_reg;
3696 break;
3697 case BAD_FILE:
3698 /* Probably unused. */
3699 brw_reg = brw_null_reg();
3700 break;
3701 case UNIFORM:
3702 assert(!"not reached");
3703 brw_reg = brw_null_reg();
3704 break;
3705 default:
3706 assert(!"not reached");
3707 brw_reg = brw_null_reg();
3708 break;
3709 }
3710 if (reg->abs)
3711 brw_reg = brw_abs(brw_reg);
3712 if (reg->negate)
3713 brw_reg = negate(brw_reg);
3714
3715 return brw_reg;
3716 }
3717
3718 void
3719 fs_visitor::generate_code()
3720 {
3721 int last_native_inst = p->nr_insn;
3722 const char *last_annotation_string = NULL;
3723 ir_instruction *last_annotation_ir = NULL;
3724
3725 int if_stack_array_size = 16;
3726 int loop_stack_array_size = 16;
3727 int if_stack_depth = 0, loop_stack_depth = 0;
3728 brw_instruction **if_stack =
3729 rzalloc_array(this->mem_ctx, brw_instruction *, if_stack_array_size);
3730 brw_instruction **loop_stack =
3731 rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
3732 int *if_depth_in_loop =
3733 rzalloc_array(this->mem_ctx, int, loop_stack_array_size);
3734
3735
3736 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3737 printf("Native code for fragment shader %d (%d-wide dispatch):\n",
3738 ctx->Shader.CurrentFragmentProgram->Name, c->dispatch_width);
3739 }
3740
3741 foreach_iter(exec_list_iterator, iter, this->instructions) {
3742 fs_inst *inst = (fs_inst *)iter.get();
3743 struct brw_reg src[3], dst;
3744
3745 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3746 if (last_annotation_ir != inst->ir) {
3747 last_annotation_ir = inst->ir;
3748 if (last_annotation_ir) {
3749 printf(" ");
3750 last_annotation_ir->print();
3751 printf("\n");
3752 }
3753 }
3754 if (last_annotation_string != inst->annotation) {
3755 last_annotation_string = inst->annotation;
3756 if (last_annotation_string)
3757 printf(" %s\n", last_annotation_string);
3758 }
3759 }
3760
3761 for (unsigned int i = 0; i < 3; i++) {
3762 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
3763 }
3764 dst = brw_reg_from_fs_reg(&inst->dst);
3765
3766 brw_set_conditionalmod(p, inst->conditional_mod);
3767 brw_set_predicate_control(p, inst->predicated);
3768 brw_set_predicate_inverse(p, inst->predicate_inverse);
3769 brw_set_saturate(p, inst->saturate);
3770
3771 if (inst->force_uncompressed || c->dispatch_width == 8) {
3772 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
3773 } else if (inst->force_sechalf) {
3774 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
3775 } else {
3776 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
3777 }
3778
3779 switch (inst->opcode) {
3780 case BRW_OPCODE_MOV:
3781 brw_MOV(p, dst, src[0]);
3782 break;
3783 case BRW_OPCODE_ADD:
3784 brw_ADD(p, dst, src[0], src[1]);
3785 break;
3786 case BRW_OPCODE_MUL:
3787 brw_MUL(p, dst, src[0], src[1]);
3788 break;
3789
3790 case BRW_OPCODE_FRC:
3791 brw_FRC(p, dst, src[0]);
3792 break;
3793 case BRW_OPCODE_RNDD:
3794 brw_RNDD(p, dst, src[0]);
3795 break;
3796 case BRW_OPCODE_RNDE:
3797 brw_RNDE(p, dst, src[0]);
3798 break;
3799 case BRW_OPCODE_RNDZ:
3800 brw_RNDZ(p, dst, src[0]);
3801 break;
3802
3803 case BRW_OPCODE_AND:
3804 brw_AND(p, dst, src[0], src[1]);
3805 break;
3806 case BRW_OPCODE_OR:
3807 brw_OR(p, dst, src[0], src[1]);
3808 break;
3809 case BRW_OPCODE_XOR:
3810 brw_XOR(p, dst, src[0], src[1]);
3811 break;
3812 case BRW_OPCODE_NOT:
3813 brw_NOT(p, dst, src[0]);
3814 break;
3815 case BRW_OPCODE_ASR:
3816 brw_ASR(p, dst, src[0], src[1]);
3817 break;
3818 case BRW_OPCODE_SHR:
3819 brw_SHR(p, dst, src[0], src[1]);
3820 break;
3821 case BRW_OPCODE_SHL:
3822 brw_SHL(p, dst, src[0], src[1]);
3823 break;
3824
3825 case BRW_OPCODE_CMP:
3826 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
3827 break;
3828 case BRW_OPCODE_SEL:
3829 brw_SEL(p, dst, src[0], src[1]);
3830 break;
3831
3832 case BRW_OPCODE_IF:
3833 if (inst->src[0].file != BAD_FILE) {
3834 assert(intel->gen >= 6);
3835 if_stack[if_stack_depth] = gen6_IF(p, inst->conditional_mod, src[0], src[1]);
3836 } else {
3837 if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
3838 }
3839 if_depth_in_loop[loop_stack_depth]++;
3840 if_stack_depth++;
3841 if (if_stack_array_size <= if_stack_depth) {
3842 if_stack_array_size *= 2;
3843 if_stack = reralloc(this->mem_ctx, if_stack, brw_instruction *,
3844 if_stack_array_size);
3845 }
3846 break;
3847
3848 case BRW_OPCODE_ELSE:
3849 if_stack[if_stack_depth - 1] =
3850 brw_ELSE(p, if_stack[if_stack_depth - 1]);
3851 break;
3852 case BRW_OPCODE_ENDIF:
3853 if_stack_depth--;
3854 	 brw_ENDIF(p, if_stack[if_stack_depth]);
3855 if_depth_in_loop[loop_stack_depth]--;
3856 break;
3857
3858 case BRW_OPCODE_DO:
3859 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
3860 if (loop_stack_array_size <= loop_stack_depth) {
3861 loop_stack_array_size *= 2;
3862 loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *,
3863 loop_stack_array_size);
3864 if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
3865 loop_stack_array_size);
3866 }
3867 if_depth_in_loop[loop_stack_depth] = 0;
3868 break;
3869
3870 case BRW_OPCODE_BREAK:
3871 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
3872 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3873 break;
3874 case BRW_OPCODE_CONTINUE:
3875 /* FINISHME: We need to write the loop instruction support still. */
3876 if (intel->gen >= 6)
3877 gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
3878 else
3879 brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
3880 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3881 break;
3882
3883 case BRW_OPCODE_WHILE: {
3884 struct brw_instruction *inst0, *inst1;
3885 GLuint br = 1;
3886
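	 /* Jump counts are in units of 64 bits from gen5 on (two per full
	  * 128-bit instruction), hence the doubled branch distance.
	  */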
3887 if (intel->gen >= 5)
3888 br = 2;
3889
3890 assert(loop_stack_depth > 0);
3891 loop_stack_depth--;
3892 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
3893 if (intel->gen < 6) {
3894 /* patch all the BREAK/CONT instructions from last BGNLOOP */
3895 while (inst0 > loop_stack[loop_stack_depth]) {
3896 inst0--;
3897 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
3898 inst0->bits3.if_else.jump_count == 0) {
3899 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
3900 }
3901 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
3902 inst0->bits3.if_else.jump_count == 0) {
3903 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3904 }
3905 }
3906 }
3907 }
3908 break;
3909
3910 case FS_OPCODE_RCP:
3911 case FS_OPCODE_RSQ:
3912 case FS_OPCODE_SQRT:
3913 case FS_OPCODE_EXP2:
3914 case FS_OPCODE_LOG2:
3915 case FS_OPCODE_POW:
3916 case FS_OPCODE_SIN:
3917 case FS_OPCODE_COS:
3918 generate_math(inst, dst, src);
3919 break;
3920 case FS_OPCODE_PIXEL_X:
3921 generate_pixel_xy(dst, true);
3922 break;
3923 case FS_OPCODE_PIXEL_Y:
3924 generate_pixel_xy(dst, false);
3925 break;
3926 case FS_OPCODE_CINTERP:
3927 brw_MOV(p, dst, src[0]);
3928 break;
3929 case FS_OPCODE_LINTERP:
3930 generate_linterp(inst, dst, src);
3931 break;
3932 case FS_OPCODE_TEX:
3933 case FS_OPCODE_TXB:
3934 case FS_OPCODE_TXD:
3935 case FS_OPCODE_TXL:
3936 generate_tex(inst, dst, src[0]);
3937 break;
3938 case FS_OPCODE_DISCARD_NOT:
3939 generate_discard_not(inst, dst);
3940 break;
3941 case FS_OPCODE_DISCARD_AND:
3942 generate_discard_and(inst, src[0]);
3943 break;
3944 case FS_OPCODE_DDX:
3945 generate_ddx(inst, dst, src[0]);
3946 break;
3947 case FS_OPCODE_DDY:
3948 generate_ddy(inst, dst, src[0]);
3949 break;
3950
3951 case FS_OPCODE_SPILL:
3952 generate_spill(inst, src[0]);
3953 break;
3954
3955 case FS_OPCODE_UNSPILL:
3956 generate_unspill(inst, dst);
3957 break;
3958
3959 case FS_OPCODE_PULL_CONSTANT_LOAD:
3960 generate_pull_constant_load(inst, dst);
3961 break;
3962
3963 case FS_OPCODE_FB_WRITE:
3964 generate_fb_write(inst);
3965 break;
3966 default:
3967 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
3968 _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
3969 brw_opcodes[inst->opcode].name);
3970 } else {
3971 _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
3972 }
3973 fail("unsupported opcode in FS\n");
3974 }
3975
3976 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3977 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
3978 if (0) {
3979 printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3980 ((uint32_t *)&p->store[i])[3],
3981 ((uint32_t *)&p->store[i])[2],
3982 ((uint32_t *)&p->store[i])[1],
3983 ((uint32_t *)&p->store[i])[0]);
3984 }
3985 brw_disasm(stdout, &p->store[i], intel->gen);
3986 }
3987 }
3988
3989 last_native_inst = p->nr_insn;
3990 }
3991
3992 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3993 printf("\n");
3994 }
3995
3996 ralloc_free(if_stack);
3997 ralloc_free(loop_stack);
3998 ralloc_free(if_depth_in_loop);
3999
4000 brw_set_uip_jip(p);
4001
4002      /* While the INTEL_DEBUG=wm output above is handy for debugging FS
4003       * emit issues, it is printed before brw_set_uip_jip() fills in the
4004       * jump distances, which are often exactly what we want to inspect.
4005       * Enable this block when you need a dump that includes them.
4006       */
4007 if (0) {
4008 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4009 for (unsigned int i = 0; i < p->nr_insn; i++) {
4010 printf("0x%08x 0x%08x 0x%08x 0x%08x ",
4011 ((uint32_t *)&p->store[i])[3],
4012 ((uint32_t *)&p->store[i])[2],
4013 ((uint32_t *)&p->store[i])[1],
4014 ((uint32_t *)&p->store[i])[0]);
4015 brw_disasm(stdout, &p->store[i], intel->gen);
4016 }
4017 }
4018 }
4019 }
4020
4021 bool
4022 fs_visitor::run()
4023 {
4024 uint32_t prog_offset_16 = 0;
4025 uint32_t orig_nr_params = c->prog_data.nr_params;
4026
4027 brw_wm_payload_setup(brw, c);
4028
4029 if (c->dispatch_width == 16) {
4030      /* Align to a 64-byte boundary. */
4031 while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
4032 brw_NOP(p);
4033 }
4034
4035 /* Save off the start of this 16-wide program in case we succeed. */
4036 prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);
4037
4038 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
4039 }
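   /* The NOP padding above is needed because the 16-wide program is
    * appended to the same instruction store as the 8-wide one, and the
    * hardware's kernel start pointers are specified in 64-byte units, so
    * prog_offset_16 must be 64-byte aligned.  BRW_COMPRESSION_COMPRESSED
    * then makes subsequent instructions operate on all 16 channels (both
    * halves of the register pair) by default.
    */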
4040
4041 if (0) {
4042 emit_dummy_fs();
4043 } else {
4044 calculate_urb_setup();
4045 if (intel->gen < 6)
4046 emit_interpolation_setup_gen4();
4047 else
4048 emit_interpolation_setup_gen6();
4049
4050      /* Generate FS IR for main().  (The visitor only descends into
4051       * functions called "main".)
4052       */
4053 foreach_iter(exec_list_iterator, iter, *shader->ir) {
4054 ir_instruction *ir = (ir_instruction *)iter.get();
4055 base_ir = ir;
4056 ir->accept(this);
4057 }
4058
4059 emit_fb_writes();
4060
4061 split_virtual_grfs();
4062
4063 setup_paramvalues_refs();
4064 setup_pull_constants();
4065
4066 bool progress;
4067 do {
4068 progress = false;
4069
4070 progress = remove_duplicate_mrf_writes() || progress;
4071
4072 progress = propagate_constants() || progress;
4073 progress = register_coalesce() || progress;
4074 progress = compute_to_mrf() || progress;
4075 progress = dead_code_eliminate() || progress;
4076 } while (progress);
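      /* Iterate the peephole passes to a fixed point: each pass can expose
       * new opportunities for the others (e.g. constant propagation leaving
       * an instruction dead for dead_code_eliminate() to remove), so we loop
       * until a full round makes no progress.
       */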
4077
4078 schedule_instructions();
4079
4080 assign_curb_setup();
4081 assign_urb_setup();
4082
4083 if (0) {
4084 /* Debug of register spilling: Go spill everything. */
4085 int virtual_grf_count = virtual_grf_next;
4086 for (int i = 1; i < virtual_grf_count; i++) {
4087 spill_reg(i);
4088 }
4089 }
4090
4091 if (0)
4092 assign_regs_trivial();
4093 else {
4094 while (!assign_regs()) {
4095 if (failed)
4096 break;
4097 }
4098 }
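      /* assign_regs() returns false when allocation fails; each failed
       * attempt spills a register before we retry, so this loop terminates
       * either with a successful allocation or with the visitor flagged as
       * failed.
       */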
4099 }
4100 assert(force_uncompressed_stack == 0);
4101 assert(force_sechalf_stack == 0);
4102
4103 if (failed)
4104 return false;
4105
4106 generate_code();
4107
4108 if (c->dispatch_width == 8) {
4109 c->prog_data.total_grf = grf_used;
4110 } else {
4111 c->prog_data.total_grf_16 = grf_used;
4112 c->prog_data.prog_offset_16 = prog_offset_16;
4113
4114      /* Make sure we didn't try to sneak in an extra uniform. */
4115 assert(orig_nr_params == c->prog_data.nr_params);
4116 }
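   /* The nr_params check above matters because the 8-wide and 16-wide
    * programs share a single push-constant layout: the second compile
    * re-walks the same IR, so any uniform it discovered that the first
    * pass didn't would corrupt that layout.
    */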
4117
4118 return !failed;
4119 }
4120
4121 bool
4122 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
4123 {
4124 struct intel_context *intel = &brw->intel;
4125 struct gl_context *ctx = &intel->ctx;
4126 struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
4127
4128 if (!prog)
4129 return false;
4130
4131 struct brw_shader *shader =
4132 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
4133 if (!shader)
4134 return false;
4135
4136 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
4137 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
4138 _mesa_print_ir(shader->ir, NULL);
4139 printf("\n\n");
4140 }
4141
4142 /* Now the main event: Visit the shader IR and generate our FS IR for it.
4143 */
4144 c->dispatch_width = 8;
4145
4146 fs_visitor v(c, shader);
4147 if (!v.run()) {
4148 /* FINISHME: Cleanly fail, test at link time, etc. */
4149 assert(!"not reached");
4150 return false;
4151 }
4152
4153 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
4154 c->dispatch_width = 16;
4155 fs_visitor v2(c, shader);
4156 v2.import_uniforms(v.variable_ht);
4157 v2.run();
4158 }
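   /* The 16-wide compile is opportunistic: v2 shares v's uniform hash table
    * via import_uniforms() so both programs agree on param slots, and if
    * v2.run() fails we silently fall back to the 8-wide program alone.  The
    * nr_pull_params == 0 check is presumably because the pull-constant load
    * path isn't supported in 16-wide mode here yet.
    */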
4159
4160 c->prog_data.dispatch_width = 8;
4161
4162 return true;
4163 }