src/mesa/drivers/dri/i965/brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "../glsl/glsl_types.h"
#include "../glsl/ir_optimization.h"
#include "../glsl/ir_print_visitor.h"

#define MAX_INSTRUCTION (1 << 30)
static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);

struct gl_shader *
brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
{
   struct brw_shader *shader;

   shader = rzalloc(NULL, struct brw_shader);
   if (shader) {
      shader->base.Type = type;
      shader->base.Name = name;
      _mesa_init_shader(ctx, &shader->base);
   }

   return &shader->base;
}

struct gl_shader_program *
brw_new_shader_program(struct gl_context *ctx, GLuint name)
{
   struct brw_shader_program *prog;
   prog = rzalloc(NULL, struct brw_shader_program);
   if (prog) {
      prog->base.Name = name;
      _mesa_init_shader_program(ctx, &prog->base);
   }
   return &prog->base;
}

GLboolean
brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;

   struct brw_shader *shader =
      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (shader != NULL) {
      void *mem_ctx = ralloc_context(NULL);
      bool progress;

      if (shader->ir)
         ralloc_free(shader->ir);
      shader->ir = new(shader) exec_list;
      clone_ir_list(mem_ctx, shader->ir, shader->base.ir);

      do_mat_op_to_vec(shader->ir);
      lower_instructions(shader->ir,
                         MOD_TO_FRACT |
                         DIV_TO_MUL_RCP |
                         SUB_TO_ADD_NEG |
                         EXP_TO_EXP2 |
                         LOG_TO_LOG2);

      /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
       * if-statements need to be flattened.
       */
      if (intel->gen < 6)
         lower_if_to_cond_assign(shader->ir, 16);

      do_lower_texture_projection(shader->ir);
      do_vec_index_to_cond_assign(shader->ir);
      brw_do_cubemap_normalize(shader->ir);
      lower_noise(shader->ir);
      lower_quadop_vector(shader->ir, false);
      lower_variable_index_to_cond_assign(shader->ir,
                                          GL_TRUE, /* input */
                                          GL_TRUE, /* output */
                                          GL_TRUE, /* temp */
                                          GL_TRUE /* uniform */
                                          );

      do {
         progress = false;

         brw_do_channel_expressions(shader->ir);
         brw_do_vector_splitting(shader->ir);

         progress = do_lower_jumps(shader->ir, true, true,
                                   true, /* main return */
                                   false, /* continue */
                                   false /* loops */
                                   ) || progress;

         progress = do_common_optimization(shader->ir, true, 32) || progress;
      } while (progress);

      validate_ir_tree(shader->ir);

      reparent_ir(shader->ir, shader->ir);
      ralloc_free(mem_ctx);
   }

   if (!_mesa_ir_link_shader(ctx, prog))
      return GL_FALSE;

   return GL_TRUE;
}

static int
type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}
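
/* A worked example of type_size() (illustration only, not from the original
 * file): every component occupies one scalar register slot in this backend,
 * so
 *
 *    float                          -> 1
 *    vec4                           -> 4
 *    mat3                           -> 9  (components() = columns * rows)
 *    float[4]                       -> 4
 *    struct { vec3 a; float b; }    -> 4
 *    sampler2D                      -> 0  (baked in at link time)
 */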

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      return 1;
   case FS_OPCODE_POW:
      return 2;
   case FS_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case FS_OPCODE_TXD:
   case FS_OPCODE_TXL:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);

      /* This slot is always unused. */
      virtual_grf_sizes[0] = 0;
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}
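
/* Usage sketch (hypothetical values, for illustration): allocating a vec4
 * temporary reserves four consecutive slots in one virtual GRF:
 *
 *    int vgrf = virtual_grf_alloc(4);   // virtual_grf_sizes[vgrf] == 4
 *
 * The size array starts at 16 entries and doubles as needed, so allocation
 * is amortized O(1); the comment above suggests index 0 stays reserved as
 * an unused slot.
 */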

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = type;
}

int
brw_type_for_base_type(const struct glsl_type *type)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
      return BRW_REGISTER_TYPE_F;
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      return BRW_REGISTER_TYPE_D;
   case GLSL_TYPE_UINT:
      return BRW_REGISTER_TYPE_UD;
   case GLSL_TYPE_ARRAY:
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_SAMPLER:
      /* These should be overridden with the type of the member when
       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
       * way to trip up if we don't.
       */
      return BRW_REGISTER_TYPE_UD;
   default:
      assert(!"not reached");
      return BRW_REGISTER_TYPE_F;
   }
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         unsigned int param = c->prog_data.nr_params++;

         assert(param < ARRAY_SIZE(c->prog_data.param));

         switch (type->base_type) {
         case GLSL_TYPE_FLOAT:
            c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
            break;
         case GLSL_TYPE_UINT:
            c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
            break;
         case GLSL_TYPE_INT:
            c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
            break;
         case GLSL_TYPE_BOOL:
            c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
            break;
         default:
            assert(!"not reached");
            c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
            break;
         }
         this->param_index[param] = loc;
         this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}
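
/* Layout illustration (assumed, not from the original source): a uniform
 * declared as
 *
 *    uniform mat2 m;
 *
 * is walked column by column as two vec2s, producing four float params
 * whose (param_index, param_offset) pairs are (loc, 0), (loc, 1),
 * (loc + 1, 0), (loc + 1, 1) -- matching the column-major order that
 * ir_to_mesa stored the values in.
 */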


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const struct gl_builtin_uniform_desc *statevar = NULL;

   for (unsigned int i = 0; _mesa_builtin_uniform_desc[i].name; i++) {
      statevar = &_mesa_builtin_uniform_desc[i];
      if (strcmp(ir->name, _mesa_builtin_uniform_desc[i].name) == 0)
         break;
   }

   if (!statevar->name) {
      this->fail = true;
      printf("Failed to find builtin uniform `%s'\n", ir->name);
      return;
   }

   int array_count;
   if (ir->type->is_array()) {
      array_count = ir->type->length;
   } else {
      array_count = 1;
   }

   for (int a = 0; a < array_count; a++) {
      for (unsigned int i = 0; i < statevar->num_elements; i++) {
         struct gl_builtin_uniform_element *element = &statevar->elements[i];
         int tokens[STATE_LENGTH];

         memcpy(tokens, element->tokens, sizeof(element->tokens));
         if (ir->type->is_array()) {
            tokens[1] = a;
         }

         /* This state reference has already been setup by ir_to_mesa,
          * but we'll get the same index back here.
          */
         int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                               (gl_state_index *)tokens);

         /* Add each of the unique swizzles of the element as a
          * parameter.  This'll end up matching the expected layout of
          * the array/matrix/structure we're trying to fill in.
          */
         int last_swiz = -1;
         for (unsigned int i = 0; i < 4; i++) {
            int swiz = GET_SWZ(element->swizzle, i);
            if (swiz == last_swiz)
               break;
            last_swiz = swiz;

            c->prog_data.param_convert[c->prog_data.nr_params] =
               PARAM_NO_CONVERT;
            this->param_index[c->prog_data.nr_params] = index;
            this->param_offset[c->prog_data.nr_params] = swiz;
            c->prog_data.nr_params++;
         }
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
   } else {
      emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_MOV, wpos,
           fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
   } else {
      emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
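
/* A sketch of the Y flip above (informal): when flipping, the single ADD
 * computes
 *
 *    gl_FragCoord.y = (drawable_height - 1.0 + center_offset) - pixel_y
 *
 * by negating pixel_y and folding the height into the constant, so the
 * flipped case costs no more instructions than the unflipped one.
 */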

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         this->fail = true;
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (c->key.flat_shade && (location == FRAG_ATTRIB_COL0 ||
                                   location == FRAG_ATTRIB_COL1)) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int c = 0; c < type->vector_elements; c++) {
               struct brw_reg interp = interp_reg(location, c);
               interp = suboffset(interp, 3);
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Perspective interpolation case. */
            for (unsigned int c = 0; c < type->vector_elements; c++) {
               struct brw_reg interp = interp_reg(location, c);
               emit(FS_OPCODE_LINTERP, attr,
                    this->delta_x, this->delta_y, fs_reg(interp));
               attr.reg_offset++;
            }

            if (intel->gen < 6) {
               attr.reg_offset -= type->vector_elements;
               for (unsigned int c = 0; c < type->vector_elements; c++) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  attr.reg_offset++;
               }
            }
         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
                           fs_reg(r1_6ud),
                           fs_reg(1u << 31));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
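
/* Gen6 note (informal; the bit position is inferred from the shift count):
 * the "primitive is back-facing" flag lives at bit 15 of g0.0.  ASR by 15
 * brings it down to bit 0, NOT inverts it, and AND 1 discards the other
 * shifted-down payload bits, e.g.
 *
 *    back-facing:  ~(0x00008000 >> 15) & 1 == 0
 *    front-facing: ~(0x00000000 >> 15) & 1 == 1
 */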

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6 && (src.file == UNIFORM ||
                           src.abs ||
                           src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = 1;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   assert(opcode == FS_OPCODE_POW);

   if (intel->gen >= 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         emit(BRW_OPCODE_MOV, expanded, src0);
         src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         emit(BRW_OPCODE_MOV, expanded, src1);
         src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1);
      inst = emit(opcode, dst, src0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2;
   }
   return inst;
}
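
/* Usage sketch (illustrative): POW is the only two-source math opcode.
 * Judging from the base_mrf/mlen setup above, on pre-gen6 the call
 *
 *    emit_math(FS_OPCODE_POW, dst, x, y);
 *
 * stages y in message register m3 (base_mrf + 1) and sends a two-register
 * payload, while gen6+ emits a plain two-source math instruction.
 */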

void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (strcmp(ir->name, "gl_FragColor") == 0) {
      this->frag_color = ir;
   } else if (strcmp(ir->name, "gl_FragData") == 0) {
      this->frag_data = ir;
   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
      this->frag_depth = ir;
   }

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
         reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
         reg = emit_frontfacing_interpolation(ir);
      } else {
         reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   }

   if (ir->mode == ir_var_uniform) {
      int param_index = c->prog_data.nr_params;

      if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
      } else {
         setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
      reg->type = brw_type_for_base_type(ir->type);
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}

void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
         break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}

void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *index;
   int element_size;

   ir->array->accept(this);
   index = ir->array_index->as_constant();

   element_size = type_size(ir->type);
   this->result.type = brw_type_for_base_type(ir->type);

   if (index) {
      assert(this->result.file == UNIFORM ||
             (this->result.file == GRF &&
              this->result.reg != 0));
      this->result.reg_offset += index->value.i[0] * element_size;
   } else {
      assert(!"FINISHME: non-constant array element");
   }
}

/* Instruction selection: Produce a MOV.sat instead of
 * MIN(MAX(val, 0), 1) when possible.
 */
bool
fs_visitor::try_emit_saturate(ir_expression *ir)
{
   ir_rvalue *sat_val = ir->as_rvalue_to_saturate();

   if (!sat_val)
      return false;

   sat_val->accept(this);
   fs_reg src = this->result;

   this->result = fs_reg(this, ir->type);
   fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
   inst->saturate = true;

   return true;
}
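
/* Example of the pattern this matches (illustration): GLSL such as
 *
 *    x = clamp(y, 0.0, 1.0);
 *
 * arrives flagged as a saturate candidate and collapses to a single
 * MOV.sat, rather than the CMP/SEL pair a min/max lowering would cost.
 */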

static uint32_t
brw_conditional_for_comparison(unsigned int op)
{
   switch (op) {
   case ir_binop_less:
      return BRW_CONDITIONAL_L;
   case ir_binop_greater:
      return BRW_CONDITIONAL_G;
   case ir_binop_lequal:
      return BRW_CONDITIONAL_LE;
   case ir_binop_gequal:
      return BRW_CONDITIONAL_GE;
   case ir_binop_equal:
   case ir_binop_all_equal: /* same as equal for scalars */
      return BRW_CONDITIONAL_Z;
   case ir_binop_nequal:
   case ir_binop_any_nequal: /* same as nequal for scalars */
      return BRW_CONDITIONAL_NZ;
   default:
      assert(!"not reached: bad operation for comparison");
      return BRW_CONDITIONAL_NZ;
   }
}

void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[2], temp;
   fs_inst *inst;

   assert(ir->get_num_operands() <= 2);

   if (try_emit_saturate(ir))
      return;

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
         ir_print_visitor v;
         printf("Failed to get tree for expression operand:\n");
         ir->operands[operand]->accept(&v);
         this->fail = true;
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to
       * scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * ones complement of the whole register, not just bit 0.
       */
      emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      this->result = op[0];
      break;
   case ir_unop_sign:
      temp = fs_reg(this, ir->type);

      emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));

      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
      inst->predicated = true;

      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
      inst->predicated = true;

      break;
   case ir_unop_rcp:
      emit_math(FS_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(FS_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(FS_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(FS_OPCODE_DDX, this->result, op[0]);
      break;
   case ir_unop_dFdy:
      emit(FS_OPCODE_DDY, this->result, op[0]);
      break;

   case ir_binop_add:
      emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
      break;
   case ir_binop_div:
      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
      break;
   case ir_binop_mod:
      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_all_equal:
   case ir_binop_nequal:
   case ir_binop_any_nequal:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
         temp.type = op[0].type;

      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
      emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1));
      break;

   case ir_binop_logic_xor:
      emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
      break;

   case ir_binop_logic_or:
      emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
      break;

   case ir_binop_logic_and:
      emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
      break;

   case ir_binop_dot:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;

   case ir_unop_sqrt:
      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_i2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
      emit(BRW_OPCODE_MOV, this->result, op[0]);
      break;
   case ir_unop_f2b:
   case ir_unop_i2b:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
         temp.type = op[0].type;

      inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
      break;

   case ir_unop_trunc:
      emit(BRW_OPCODE_RNDZ, this->result, op[0]);
      break;
   case ir_unop_ceil:
      op[0].negate = !op[0].negate;
      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
      break;
   case ir_unop_fract:
      inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
      break;
   case ir_unop_round_even:
      emit(BRW_OPCODE_RNDE, this->result, op[0]);
      break;

   case ir_binop_min:
      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
      inst->conditional_mod = BRW_CONDITIONAL_L;

      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
      inst->predicated = true;
      break;
   case ir_binop_max:
      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
      inst->conditional_mod = BRW_CONDITIONAL_G;

      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
      inst->predicated = true;
      break;

   case ir_binop_pow:
      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
      break;
   case ir_binop_bit_and:
      inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
      break;
   case ir_binop_bit_xor:
      inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
      break;
   case ir_binop_bit_or:
      inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
      break;

   case ir_unop_u2f:
   case ir_binop_lshift:
   case ir_binop_rshift:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
}

void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
                                   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
         l.type = brw_type_for_base_type(type);
         r.type = brw_type_for_base_type(type);

         fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
         inst->predicated = predicated;

         l.reg_offset++;
         r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.structure[i].type,
                                predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
      break;

   default:
      assert(!"not reached");
      break;
   }
}

void
fs_visitor::visit(ir_assignment *ir)
{
   struct fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   ir->rhs->accept(this);
   r = this->result;

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition);
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
         if (ir->write_mask & (1 << i)) {
            inst = emit(BRW_OPCODE_MOV, l, r);
            if (ir->condition)
               inst->predicated = true;
            r.reg_offset++;
         }
         l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}

fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
         coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;

      if (ir->op == ir_tex) {
         /* There's no plain shadow compare message, so we use shadow
          * compare with a bias of 0.0.
          */
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
         mlen++;
      } else if (ir->op == ir_txb) {
         ir->lod_info.bias->accept(this);
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
         mlen++;
      } else {
         assert(ir->op == ir_txl);
         ir->lod_info.lod->accept(this);
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
         mlen++;
      }

      ir->shadow_comparitor->accept(this);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
         coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else if (ir->op == ir_txd) {
      assert(!"TXD isn't supported on gen4 yet.");
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      assert(ir->op == ir_txb || ir->op == ir_txl);

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), coordinate);
         coordinate.reg_offset++;
      }

      /* lod/bias appears after u/v/r. */
      mlen += 6;

      if (ir->op == ir_txb) {
         ir->lod_info.bias->accept(this);
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
         mlen++;
      } else {
         ir->lod_info.lod->accept(this);
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
         mlen++;
      }

      /* The unused upper half. */
      mlen++;

      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk.  We'll need to move
       * this weirdness around to the expected layout.
       */
      simd16 = true;
      orig_dst = dst;
      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
                                                       2));
      dst.type = BRW_REGISTER_TYPE_F;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(FS_OPCODE_TEX, dst);
      break;
   case ir_txb:
      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      inst = emit(FS_OPCODE_TXL, dst);
      break;
   case ir_txd:
      inst = emit(FS_OPCODE_TXD, dst);
      break;
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   if (simd16) {
      for (int i = 0; i < 4; i++) {
         emit(BRW_OPCODE_MOV, orig_dst, dst);
         orig_dst.reg_offset++;
         dst.reg_offset += 2;
      }
   }

   return inst;
}
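
/* Payload sketch for the gen4 SIMD16 bias/LOD fallback above (inferred from
 * the stride-2 MRF writes): each SIMD16 parameter spans two message
 * registers and only the low SIMD8 half is written, giving roughly
 * u,-, v,-, r,-, lod,-.  The response comes back as two interleaved vec4s,
 * and the four trailing MOVs copy the even-indexed (valid) ones into the
 * real destination.
 */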

fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   /* gen5's SIMD8 sampler has slots for u, v, r, array index, then
    * optional parameters like the shadow comparator or LOD bias.  When
    * the optional parameters aren't present, the trailing base slots
    * can be omitted from the message.
    *
    * We don't fill in the unnecessary slots regardless, which may
    * look surprising in the disassembly.
    */
   int mlen = 1; /* g0 header always present. */
   int base_mrf = 1;

   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
      coordinate.reg_offset++;
   }
   mlen += ir->coordinate->type->vector_elements;

   if (ir->shadow_comparitor) {
      mlen = MAX2(mlen, 5);

      ir->shadow_comparitor->accept(this);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(FS_OPCODE_TEX, dst);
      break;
   case ir_txb:
      ir->lod_info.bias->accept(this);
      mlen = MAX2(mlen, 5);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;

      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      ir->lod_info.lod->accept(this);
      mlen = MAX2(mlen, 5);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;

      inst = emit(FS_OPCODE_TXL, dst);
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   return inst;
}

void
fs_visitor::visit(ir_texture *ir)
{
   int sampler;
   fs_inst *inst = NULL;

   ir->coordinate->accept(this);
   fs_reg coordinate = this->result;

   if (ir->offset != NULL) {
      ir_constant *offset = ir->offset->as_constant();
      assert(offset != NULL);

      signed char offsets[3];
      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++)
         offsets[i] = (signed char) offset->value.i[i];

      /* Combine all three offsets into a single unsigned dword:
       *
       *    bits 11:8 - U Offset (X component)
       *    bits  7:4 - V Offset (Y component)
       *    bits  3:0 - R Offset (Z component)
       */
      unsigned offset_bits = 0;
      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) {
         const unsigned shift = 4 * (2 - i);
         offset_bits |= (offsets[i] << shift) & (0xF << shift);
      }

      /* Explicitly set up the message header by copying g0 to msg reg m1. */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD),
           fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD));

      /* Then set the offset bits in DWord 2 of the message header. */
      emit(BRW_OPCODE_MOV,
           fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2),
                         BRW_REGISTER_TYPE_UD)),
           fs_reg(brw_imm_uw(offset_bits)));
   }

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   sampler = _mesa_get_sampler_uniform_value(ir->sampler,
                                             ctx->Shader.CurrentFragmentProgram,
                                             &brw->fragment_program->Base);
   sampler = c->fp->program.Base.SamplerUnits[sampler];

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
      int tokens[STATE_LENGTH] = {
         STATE_INTERNAL,
         STATE_TEXRECT_SCALE,
         sampler,
         0,
         0
      };

      c->prog_data.param_convert[c->prog_data.nr_params] =
         PARAM_NO_CONVERT;
      c->prog_data.param_convert[c->prog_data.nr_params + 1] =
         PARAM_NO_CONVERT;

      fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
      fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
      GLuint index = _mesa_add_state_reference(params,
                                               (gl_state_index *)tokens);

      this->param_index[c->prog_data.nr_params] = index;
      this->param_offset[c->prog_data.nr_params] = 0;
      c->prog_data.nr_params++;
      this->param_index[c->prog_data.nr_params] = index;
      this->param_offset[c->prog_data.nr_params] = 1;
      c->prog_data.nr_params++;

      fs_reg dst = fs_reg(this, ir->coordinate->type);
      fs_reg src = coordinate;
      coordinate = dst;

      emit(BRW_OPCODE_MUL, dst, src, scale_x);
      dst.reg_offset++;
      src.reg_offset++;
      emit(BRW_OPCODE_MUL, dst, src, scale_y);
   }

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::vec4_type);

   if (intel->gen < 5) {
      inst = emit_texture_gen4(ir, dst, coordinate);
   } else {
      inst = emit_texture_gen5(ir, dst, coordinate);
   }

   /* If there's an offset, we already set up m1.  To avoid the implied move,
    * use the null register.  Otherwise, we want an implied move from g0.
    */
   if (ir->offset != NULL)
      inst->src[0] = fs_reg(brw_null_reg());
   else
      inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));

   inst->sampler = sampler;

   this->result = dst;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

   if (ir->type == glsl_type::float_type) {
      /* Ignore DEPTH_TEXTURE_MODE swizzling. */
      assert(ir->sampler->type->sampler_shadow);
   } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
         int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
         fs_reg l = swizzle_dst;
         l.reg_offset += i;

         if (swiz == SWIZZLE_ZERO) {
            emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
         } else if (swiz == SWIZZLE_ONE) {
            emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
         } else {
            fs_reg r = dst;
            r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
            emit(BRW_OPCODE_MOV, l, r);
         }
      }
      this->result = swizzle_dst;
   }
}

void
fs_visitor::visit(ir_swizzle *ir)
{
   ir->val->accept(this);
   fs_reg val = this->result;

   if (ir->type->vector_elements == 1) {
      this->result.reg_offset += ir->mask.x;
      return;
   }

   fs_reg result = fs_reg(this, ir->type);
   this->result = result;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      fs_reg channel = val;
      int swiz = 0;

      switch (i) {
      case 0:
         swiz = ir->mask.x;
         break;
      case 1:
         swiz = ir->mask.y;
         break;
      case 2:
         swiz = ir->mask.z;
         break;
      case 3:
         swiz = ir->mask.w;
         break;
      }

      channel.reg_offset += swiz;
      emit(BRW_OPCODE_MOV, result, channel);
      result.reg_offset++;
   }
}
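
/* Illustration: in this scalarized backend a swizzle like "v.zxyy" becomes
 * one MOV per result channel, reading val at reg_offset + 2, 0, 1, 1.  A
 * single-channel swizzle such as "v.z" is free -- it only bumps reg_offset.
 */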

void
fs_visitor::visit(ir_discard *ir)
{
   fs_reg temp = fs_reg(this, glsl_type::uint_type);

   assert(ir->condition == NULL); /* FINISHME */

   emit(FS_OPCODE_DISCARD_NOT, temp, reg_null_d);
   emit(FS_OPCODE_DISCARD_AND, reg_null_d, temp);
   kill_emitted = true;
}

void
fs_visitor::visit(ir_constant *ir)
{
   /* Set this->result to reg at the bottom of the function because some code
    * paths will cause this visitor to be applied to other fields.  This will
    * cause the value stored in this->result to be modified.
    *
    * Make reg constant so that it doesn't get accidentally modified along the
    * way.  Yes, I actually had this problem. :(
    */
   const fs_reg reg(this, ir->type);
   fs_reg dst_reg = reg;

   if (ir->type->is_array()) {
      const unsigned size = type_size(ir->type->fields.array);

      for (unsigned i = 0; i < ir->type->length; i++) {
         ir->array_elements[i]->accept(this);
         fs_reg src_reg = this->result;

         dst_reg.type = src_reg.type;
         for (unsigned j = 0; j < size; j++) {
            emit(BRW_OPCODE_MOV, dst_reg, src_reg);
            src_reg.reg_offset++;
            dst_reg.reg_offset++;
         }
      }
   } else if (ir->type->is_record()) {
      foreach_list(node, &ir->components) {
         ir_instruction *const field = (ir_instruction *) node;
         const unsigned size = type_size(field->type);

         field->accept(this);
         fs_reg src_reg = this->result;

         dst_reg.type = src_reg.type;
         for (unsigned j = 0; j < size; j++) {
            emit(BRW_OPCODE_MOV, dst_reg, src_reg);
            src_reg.reg_offset++;
            dst_reg.reg_offset++;
         }
      }
   } else {
      const unsigned size = type_size(ir->type);

      for (unsigned i = 0; i < size; i++) {
         switch (ir->type->base_type) {
         case GLSL_TYPE_FLOAT:
            emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
            break;
         case GLSL_TYPE_UINT:
            emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
            break;
         case GLSL_TYPE_INT:
            emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
            break;
         case GLSL_TYPE_BOOL:
            emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
            break;
         default:
            assert(!"Non-float/uint/int/bool constant");
         }
         dst_reg.reg_offset++;
      }
   }

   this->result = reg;
}

void
fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
{
   ir_expression *expr = ir->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         assert(expr->operands[i]->type->is_scalar());

         expr->operands[i]->accept(this);
         op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         break;

      case ir_binop_logic_xor:
         inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_or:
         inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_and:
         inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_unop_f2b:
         if (intel->gen >= 6) {
            inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
         } else {
            inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
         }
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_unop_i2b:
         if (intel->gen >= 6) {
            inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
         } else {
            inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
         }
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
         inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
         inst->conditional_mod =
            brw_conditional_for_comparison(expr->operation);
         break;

      default:
         assert(!"not reached");
         this->fail = true;
         break;
      }
      return;
   }

   ir->accept(this);

   if (intel->gen >= 6) {
      fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result);
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}

/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
fs_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;
      fs_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         assert(expr->operands[i]->type->is_scalar());

         expr->operands[i]->accept(this);
         op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         return;

      case ir_binop_logic_xor:
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_binop_logic_or:
         temp = fs_reg(this, glsl_type::bool_type);
         emit(BRW_OPCODE_OR, temp, op[0], op[1]);
         inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_binop_logic_and:
         temp = fs_reg(this, glsl_type::bool_type);
         emit(BRW_OPCODE_AND, temp, op[0], op[1]);
         inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_unop_f2b:
         inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_unop_i2b:
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
         inst->conditional_mod =
            brw_conditional_for_comparison(expr->operation);
         return;
      default:
         assert(!"not reached");
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         this->fail = true;
         return;
      }
      return;
   }

   ir->condition->accept(this);

   fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;
}

void
fs_visitor::visit(ir_if *ir)
{
   fs_inst *inst;

   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (intel->gen >= 6) {
      emit_if_gen6(ir);
   } else {
      emit_bool_to_cond_code(ir->condition);

      inst = emit(BRW_OPCODE_IF);
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();
      this->base_ir = ir;

      ir->accept(this);
   }

   if (!ir->else_instructions.is_empty()) {
      emit(BRW_OPCODE_ELSE);

      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
         ir_instruction *ir = (ir_instruction *)iter.get();
         this->base_ir = ir;

         ir->accept(this);
      }
   }

   emit(BRW_OPCODE_ENDIF);
}

void
fs_visitor::visit(ir_loop *ir)
{
   fs_reg counter = reg_undef;

   if (ir->counter) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from) {
         this->base_ir = ir->from;
         ir->from->accept(this);

         emit(BRW_OPCODE_MOV, counter, this->result);
      }
   }

   emit(BRW_OPCODE_DO);

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);

      inst = emit(BRW_OPCODE_BREAK);
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();

      this->base_ir = ir;
      ir->accept(this);
   }

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(BRW_OPCODE_ADD, counter, counter, this->result);
   }

   emit(BRW_OPCODE_WHILE);
}

void
fs_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(BRW_OPCODE_BREAK);
      break;
   case ir_loop_jump::jump_continue:
      emit(BRW_OPCODE_CONTINUE);
      break;
   }
}

void
fs_visitor::visit(ir_call *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_return *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined before we get to ir_to_mesa.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(&empty);

      assert(sig);

      foreach_iter(exec_list_iterator, iter, sig->body) {
         ir_instruction *ir = (ir_instruction *)iter.get();
         this->base_ir = ir;

         ir->accept(this);
      }
   }
}

void
fs_visitor::visit(ir_function_signature *ir)
{
   assert(!"not reached");
   (void)ir;
}

fs_inst *
fs_visitor::emit(fs_inst inst)
{
   fs_inst *list_inst = new(mem_ctx) fs_inst;
   *list_inst = inst;

   list_inst->annotation = this->current_annotation;
   list_inst->ir = this->base_ir;

   this->instructions.push_tail(list_inst);

   return list_inst;
}

/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
void
fs_visitor::emit_dummy_fs()
{
   /* Everyone's favorite color. */
   emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f));
   emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f));
   emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f));
   emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f));

   fs_inst *write;
   write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
   write->base_mrf = 0;
}

/* The register location here is relative to the start of the URB
 * data.  It will get adjusted to be a real location before
 * generate_code() time.
 */
struct brw_reg
fs_visitor::interp_reg(int location, int channel)
{
   int regnr = urb_setup[location] * 2 + channel / 2;
   int stride = (channel & 1) * 4;

   assert(urb_setup[location] != -1);

   return brw_vec1_grf(regnr, stride);
}
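
/* Mapping example (derived from the arithmetic above): each URB slot
 * occupies two setup registers, with channels x/y in the first and z/w in
 * the second at suboffsets 0 and 4.  So with urb_setup[location] == 3,
 * channel 2 (z) yields brw_vec1_grf(7, 0).
 */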

/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gen4()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   this->current_annotation = "compute pixel centers";
   this->pixel_x = fs_reg(this, glsl_type::uint_type);
   this->pixel_y = fs_reg(this, glsl_type::uint_type);
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
   emit(BRW_OPCODE_ADD,
        this->pixel_x,
        fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
        fs_reg(brw_imm_v(0x10101010)));
   emit(BRW_OPCODE_ADD,
        this->pixel_y,
        fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
        fs_reg(brw_imm_v(0x11001100)));

   this->current_annotation = "compute pixel deltas from v0";
   if (brw->has_pln) {
      this->delta_x = fs_reg(this, glsl_type::vec2_type);
      this->delta_y = this->delta_x;
      this->delta_y.reg_offset++;
   } else {
      this->delta_x = fs_reg(this, glsl_type::float_type);
      this->delta_y = fs_reg(this, glsl_type::float_type);
   }
   emit(BRW_OPCODE_ADD, this->delta_x,
        this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
   emit(BRW_OPCODE_ADD, this->delta_y,
        this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));

   this->current_annotation = "compute pos.w and 1/pos.w";
   /* Compute wpos.w.  It's always in our setup, since it's needed to
    * interpolate the other attributes.
    */
   this->wpos_w = fs_reg(this, glsl_type::float_type);
   emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
        interp_reg(FRAG_ATTRIB_WPOS, 3));
   /* Compute the pixel 1/W value from wpos.w. */
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
   this->current_annotation = NULL;
}
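
/* Note on the brw_imm_v values above (informal): 0x10101010 and 0x11001100
 * are packed 4-bit vector immediates whose nibbles expand per channel to
 * 0,1,0,1,... and 0,0,1,1,..., turning the per-subspan X/Y in the payload
 * into the pixel coordinates of each 2x2 quad.
 */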
1971
1972 /** Emits the interpolation for the varying inputs. */
1973 void
1974 fs_visitor::emit_interpolation_setup_gen6()
1975 {
1976 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
1977
1978 /* If the pixel centers end up used, the setup is the same as for gen4. */
1979 this->current_annotation = "compute pixel centers";
1980 fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
1981 fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
1982 int_pixel_x.type = BRW_REGISTER_TYPE_UW;
1983 int_pixel_y.type = BRW_REGISTER_TYPE_UW;
1984 emit(BRW_OPCODE_ADD,
1985 int_pixel_x,
1986 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
1987 fs_reg(brw_imm_v(0x10101010)));
1988 emit(BRW_OPCODE_ADD,
1989 int_pixel_y,
1990 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
1991 fs_reg(brw_imm_v(0x11001100)));
1992
1993 /* As of gen6, we can no longer mix float and int sources. We have
1994 * to turn the integer pixel centers into floats for their actual
1995 * use.
1996 */
1997 this->pixel_x = fs_reg(this, glsl_type::float_type);
1998 this->pixel_y = fs_reg(this, glsl_type::float_type);
1999 emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
2000 emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);
2001
2002 this->current_annotation = "compute 1/pos.w";
2003 this->wpos_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
2004 this->pixel_w = fs_reg(this, glsl_type::float_type);
2005 emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
2006
2007 this->delta_x = fs_reg(brw_vec8_grf(2, 0));
2008 this->delta_y = fs_reg(brw_vec8_grf(3, 0));
2009
2010 this->current_annotation = NULL;
2011 }
2012
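/**
 * Builds the FB write message payload in MRFs -- optional header,
 * AA dest stencil, per-target color, and source/dest depth -- and
 * emits one FS_OPCODE_FB_WRITE per color region, with EOT on the last.
 */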
2013 void
2014 fs_visitor::emit_fb_writes()
2015 {
2016 this->current_annotation = "FB write header";
2017 bool header_present = true;
2018 int nr = 0;
2019
2020 if (intel->gen >= 6 &&
2021 !this->kill_emitted &&
2022 c->key.nr_color_regions == 1) {
2023 header_present = false;
2024 }
2025
2026 if (header_present) {
2027 /* m0, m1 header */
2028 nr += 2;
2029 }
2030
2031 if (c->aa_dest_stencil_reg) {
2032 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2033 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
2034 }
2035
2036 /* Reserve space for color. It'll be filled in per MRT below. */
2037 int color_mrf = nr;
2038 nr += 4;
2039
2040 if (c->source_depth_to_render_target) {
2041 if (c->computes_depth) {
2042 /* Hand over gl_FragDepth. */
2043 assert(this->frag_depth);
2044 fs_reg depth = *(variable_storage(this->frag_depth));
2045
2046 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth);
2047 } else {
2048 /* Pass through the payload depth. */
2049 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2050 fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
2051 }
2052 }
2053
2054 if (c->dest_depth_reg) {
2055 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2056 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
2057 }
2058
2059 fs_reg color = reg_undef;
2060 if (this->frag_color)
2061 color = *(variable_storage(this->frag_color));
2062 else if (this->frag_data) {
2063 color = *(variable_storage(this->frag_data));
2064 color.type = BRW_REGISTER_TYPE_F;
2065 }
2066
2067 for (int target = 0; target < c->key.nr_color_regions; target++) {
2068 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2069 "FB write target %d",
2070 target);
2071 if (this->frag_color || this->frag_data) {
2072 for (int i = 0; i < 4; i++) {
2073 emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i), color);
2074 color.reg_offset++;
2075 }
2076 }
2077
2078 if (this->frag_color)
2079 color.reg_offset -= 4;
2080
2081 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2082 inst->target = target;
2083 inst->base_mrf = 0;
2084 inst->mlen = nr;
2085 if (target == c->key.nr_color_regions - 1)
2086 inst->eot = true;
2087 inst->header_present = header_present;
2088 }
2089
2090 if (c->key.nr_color_regions == 0) {
2091 if (c->key.alpha_test && (this->frag_color || this->frag_data)) {
2092 /* If the alpha test is enabled but there's no color buffer,
2093 * we still need to send alpha out the pipeline to our null
2094 * renderbuffer.
2095 */
2096 color.reg_offset += 3;
2097 emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + 3), color);
2098 }
2099
2100 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2101 inst->base_mrf = 0;
2102 inst->mlen = nr;
2103 inst->eot = true;
2104 inst->header_present = header_present;
2105 }
2106
2107 this->current_annotation = NULL;
2108 }
2109
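/**
 * Generates the native code for an FB write: builds the two-register
 * message header (copied explicitly on gen6, passed as the implied
 * header on older parts) and emits the actual send.
 */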
2110 void
2111 fs_visitor::generate_fb_write(fs_inst *inst)
2112 {
2113 GLboolean eot = inst->eot;
2114 struct brw_reg implied_header;
2115
2116 /* The header is 2 regs, with g0 and g1 as the contents. g0 is
2117 * handled by the implied move; here we set up g1.
2118 */
2119 brw_push_insn_state(p);
2120 brw_set_mask_control(p, BRW_MASK_DISABLE);
2121 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2122
2123 if (inst->header_present) {
2124 if (intel->gen >= 6) {
2125 brw_MOV(p,
2126 brw_message_reg(inst->base_mrf),
2127 brw_vec8_grf(0, 0));
2128
2129 if (inst->target > 0) {
2130 /* Set the render target index for choosing BLEND_STATE. */
2131 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
2132 BRW_REGISTER_TYPE_UD),
2133 brw_imm_ud(inst->target));
2134 }
2135
2136 /* Clear viewport index, render target array index. */
2137 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0),
2138 BRW_REGISTER_TYPE_UD),
2139 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2140 brw_imm_ud(0xf7ff));
2141
2142 implied_header = brw_null_reg();
2143 } else {
2144 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
2145 }
2146
2147 brw_MOV(p,
2148 brw_message_reg(inst->base_mrf + 1),
2149 brw_vec8_grf(1, 0));
2150 } else {
2151 implied_header = brw_null_reg();
2152 }
2153
2154 brw_pop_insn_state(p);
2155
2156 brw_fb_WRITE(p,
2157 8, /* dispatch_width */
2158 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
2159 inst->base_mrf,
2160 implied_header,
2161 inst->target,
2162 inst->mlen,
2163 0,
2164 eot,
2165 inst->header_present);
2166 }
2167
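/* Emits a single PLN when the hardware supports it and delta_x/delta_y
 * live in a suitably aligned register pair; otherwise falls back to the
 * equivalent LINE+MAC sequence.
 */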
2168 void
2169 fs_visitor::generate_linterp(fs_inst *inst,
2170 struct brw_reg dst, struct brw_reg *src)
2171 {
2172 struct brw_reg delta_x = src[0];
2173 struct brw_reg delta_y = src[1];
2174 struct brw_reg interp = src[2];
2175
2176 if (brw->has_pln &&
2177 delta_y.nr == delta_x.nr + 1 &&
2178 (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
2179 brw_PLN(p, dst, interp, delta_x);
2180 } else {
2181 brw_LINE(p, brw_null_reg(), interp, delta_x);
2182 brw_MAC(p, dst, suboffset(interp, 1), delta_y);
2183 }
2184 }
2185
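/* Translates an FS math opcode into the hardware math instruction.
 * On gen6+ math operates directly on registers (mlen == 0); on older
 * parts it's message-based, sourcing from base_mrf (mlen >= 1).
 */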
2186 void
2187 fs_visitor::generate_math(fs_inst *inst,
2188 struct brw_reg dst, struct brw_reg *src)
2189 {
2190 int op;
2191
2192 switch (inst->opcode) {
2193 case FS_OPCODE_RCP:
2194 op = BRW_MATH_FUNCTION_INV;
2195 break;
2196 case FS_OPCODE_RSQ:
2197 op = BRW_MATH_FUNCTION_RSQ;
2198 break;
2199 case FS_OPCODE_SQRT:
2200 op = BRW_MATH_FUNCTION_SQRT;
2201 break;
2202 case FS_OPCODE_EXP2:
2203 op = BRW_MATH_FUNCTION_EXP;
2204 break;
2205 case FS_OPCODE_LOG2:
2206 op = BRW_MATH_FUNCTION_LOG;
2207 break;
2208 case FS_OPCODE_POW:
2209 op = BRW_MATH_FUNCTION_POW;
2210 break;
2211 case FS_OPCODE_SIN:
2212 op = BRW_MATH_FUNCTION_SIN;
2213 break;
2214 case FS_OPCODE_COS:
2215 op = BRW_MATH_FUNCTION_COS;
2216 break;
2217 default:
2218 assert(!"not reached: unknown math function");
2219 op = 0;
2220 break;
2221 }
2222
2223 if (intel->gen >= 6) {
2224 assert(inst->mlen == 0);
2225
2226 if (inst->opcode == FS_OPCODE_POW) {
2227 brw_math2(p, dst, op, src[0], src[1]);
2228 } else {
2229 brw_math(p, dst,
2230 op,
2231 inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2232 BRW_MATH_SATURATE_NONE,
2233 0, src[0],
2234 BRW_MATH_DATA_VECTOR,
2235 BRW_MATH_PRECISION_FULL);
2236 }
2237 } else {
2238 assert(inst->mlen >= 1);
2239
2240 brw_math(p, dst,
2241 op,
2242 inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2243 BRW_MATH_SATURATE_NONE,
2244 inst->base_mrf, src[0],
2245 BRW_MATH_DATA_VECTOR,
2246 BRW_MATH_PRECISION_FULL);
2247 }
2248 }
2249
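/* Selects the sampler message type for a texture instruction -- gen5+
 * has distinct message types per opcode, while gen4 mostly encodes
 * shadow compare and SIMD width in the message length -- and emits the
 * SAMPLE send.
 */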
2250 void
2251 fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2252 {
2253 int msg_type = -1;
2254 int rlen = 4;
2255 uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
2256
2257 if (intel->gen >= 5) {
2258 switch (inst->opcode) {
2259 case FS_OPCODE_TEX:
2260 if (inst->shadow_compare) {
2261 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
2262 } else {
2263 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
2264 }
2265 break;
2266 case FS_OPCODE_TXB:
2267 if (inst->shadow_compare) {
2268 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
2269 } else {
2270 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
2271 }
2272 break;
2273 case FS_OPCODE_TXL:
2274 if (inst->shadow_compare) {
2275 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
2276 } else {
2277 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
2278 }
2279 break;
2280 case FS_OPCODE_TXD:
2281 assert(!"TXD isn't supported on gen5+ yet.");
2282 break;
2283 }
2284 } else {
2285 switch (inst->opcode) {
2286 case FS_OPCODE_TEX:
2287 /* Note that G45 and older determine shadow compare and dispatch width
2288 * from the message length for most messages.
2289 */
2290 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2291 if (inst->shadow_compare) {
2292 assert(inst->mlen == 6);
2293 } else {
2294 assert(inst->mlen <= 4);
2295 }
2296 break;
2297 case FS_OPCODE_TXB:
2298 if (inst->shadow_compare) {
2299 assert(inst->mlen == 6);
2300 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
2301 } else {
2302 assert(inst->mlen == 9);
2303 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2304 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2305 }
2306 break;
2307 case FS_OPCODE_TXL:
2308 if (inst->shadow_compare) {
2309 assert(inst->mlen == 6);
2310 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
2311 } else {
2312 assert(inst->mlen == 9);
2313 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
2314 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2315 }
2316 break;
2317 case FS_OPCODE_TXD:
2318 assert(!"TXD isn't supported on gen4 yet.");
2319 break;
2320 }
2321 }
2322 assert(msg_type != -1);
2323
2324 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
2325 rlen = 8;
2326 dst = vec16(dst);
2327 }
2328
2329 brw_SAMPLE(p,
2330 retype(dst, BRW_REGISTER_TYPE_UW),
2331 inst->base_mrf,
2332 src,
2333 SURF_INDEX_TEXTURE(inst->sampler),
2334 inst->sampler,
2335 WRITEMASK_XYZW,
2336 msg_type,
2337 rlen,
2338 inst->mlen,
2339 0,
2340 1,
2341 simd_mode);
2342 }
2343
2344
2345 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
2346 * looking like:
2347 *
2348 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
2349 *
2350 * and we're trying to produce:
2351 *
2352 * DDX DDY
2353 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
2354 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
2355 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
2356 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
2357 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
2358 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
2359 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
2360 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
2361 *
2362 * and add another set of two more subspans if in 16-pixel dispatch mode.
2363 *
2364 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
2365 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
2366 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
2367 * between each other. We could probably do it like ddx and swizzle the right
2368 * order later, but bail for now and just produce
2369 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
2370 */
2371 void
2372 fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2373 {
2374 struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
2375 BRW_REGISTER_TYPE_F,
2376 BRW_VERTICAL_STRIDE_2,
2377 BRW_WIDTH_2,
2378 BRW_HORIZONTAL_STRIDE_0,
2379 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2380 struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
2381 BRW_REGISTER_TYPE_F,
2382 BRW_VERTICAL_STRIDE_2,
2383 BRW_WIDTH_2,
2384 BRW_HORIZONTAL_STRIDE_0,
2385 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2386 brw_ADD(p, dst, src0, negate(src1));
2387 }
2388
2389 void
2390 fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2391 {
2392 struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
2393 BRW_REGISTER_TYPE_F,
2394 BRW_VERTICAL_STRIDE_4,
2395 BRW_WIDTH_4,
2396 BRW_HORIZONTAL_STRIDE_0,
2397 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2398 struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
2399 BRW_REGISTER_TYPE_F,
2400 BRW_VERTICAL_STRIDE_4,
2401 BRW_WIDTH_4,
2402 BRW_HORIZONTAL_STRIDE_0,
2403 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2404 brw_ADD(p, dst, src0, negate(src1));
2405 }
2406
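/* First half of discard handling: produces the mask of channels to
 * keep. See generate_discard_and() for how it gets applied.
 */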
2407 void
2408 fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
2409 {
2410 if (intel->gen >= 6) {
2411 /* Gen6 no longer has the mask reg for us to just read the
2412 * active channels from. However, cmp updates just the channels
2413 * of the flag reg that are enabled, so we can get at the
2414 * channel enables that way. In this step, make a reg of ones
2415 * we'll compare to.
2416 */
2417 brw_MOV(p, mask, brw_imm_ud(1));
2418 } else {
2419 brw_push_insn_state(p);
2420 brw_set_mask_control(p, BRW_MASK_DISABLE);
2421 brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
2422 brw_pop_insn_state(p);
2423 }
2424 }
2425
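/* Second half of discard handling: ANDs the kill mask into the payload
 * header mask (g0 on pre-gen6, g1.7 via the flag-register trick on
 * gen6) so discarded channels drop out of the eventual FB write.
 */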
2426 void
2427 fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
2428 {
2429 if (intel->gen >= 6) {
2430 struct brw_reg f0 = brw_flag_reg();
2431 struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
2432
2433 brw_push_insn_state(p);
2434 brw_set_mask_control(p, BRW_MASK_DISABLE);
2435 brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */
2436 brw_pop_insn_state(p);
2437
2438 brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
2439 BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */
2440 /* Undo CMP's whacking of predication. */
2441 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2442
2443 brw_push_insn_state(p);
2444 brw_set_mask_control(p, BRW_MASK_DISABLE);
2445 brw_AND(p, g1, f0, g1);
2446 brw_pop_insn_state(p);
2447 } else {
2448 struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
2449
2450 mask = brw_uw1_reg(mask.file, mask.nr, 0);
2451
2452 brw_push_insn_state(p);
2453 brw_set_mask_control(p, BRW_MASK_DISABLE);
2454 brw_AND(p, g0, mask, g0);
2455 brw_pop_insn_state(p);
2456 }
2457 }
2458
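/* Writes one register of a spilled virtual GRF out to its scratch
 * slot at inst->offset.
 */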
2459 void
2460 fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
2461 {
2462 assert(inst->mlen != 0);
2463
2464 brw_MOV(p,
2465 retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
2466 retype(src, BRW_REGISTER_TYPE_UD));
2467 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
2468 inst->offset);
2469 }
2470
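/* Reads a spilled register back from scratch, working around the gen4
 * send-destination errata described below.
 */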
2471 void
2472 fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
2473 {
2474 assert(inst->mlen != 0);
2475
2476 /* Clear any post destination dependencies that would be ignored by
2477 * the block read. See the B-Spec for pre-gen5 send instruction.
2478 *
2479 * This could use a better solution, since texture sampling and
2480 * math reads could potentially run into it as well -- anywhere
2481 * that we have a SEND with a destination that is a register that
2482 * was written but not read within the last N instructions (what's
2483 * N? unsure). This is rare because of dead code elimination, but
2484 * not impossible.
2485 */
2486 if (intel->gen == 4 && !intel->is_g4x)
2487 brw_MOV(p, brw_null_reg(), dst);
2488
2489 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
2490 inst->offset);
2491
2492 if (intel->gen == 4 && !intel->is_g4x) {
2493 /* gen4 errata: destination from a send can't be used as a
2494 * destination until it's been read. Just read it so we don't
2495 * have to worry.
2496 */
2497 brw_MOV(p, brw_null_reg(), dst);
2498 }
2499 }
2500
2501
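/* Emits the oword block read that loads a pulled uniform from the
 * constant buffer surface.
 */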
2502 void
2503 fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
2504 {
2505 assert(inst->mlen != 0);
2506
2507 /* Clear any post destination dependencies that would be ignored by
2508 * the block read. See the B-Spec for pre-gen5 send instruction.
2509 *
2510 * This could use a better solution, since texture sampling and
2511 * math reads could potentially run into it as well -- anywhere
2512 * that we have a SEND with a destination that is a register that
2513 * was written but not read within the last N instructions (what's
2514 * N? unsure). This is rare because of dead code elimination, but
2515 * not impossible.
2516 */
2517 if (intel->gen == 4 && !intel->is_g4x)
2518 brw_MOV(p, brw_null_reg(), dst);
2519
2520 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
2521 inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
2522
2523 if (intel->gen == 4 && !intel->is_g4x) {
2524 /* gen4 errata: destination from a send can't be used as a
2525 * destination until it's been read. Just read it so we don't
2526 * have to worry.
2527 */
2528 brw_MOV(p, brw_null_reg(), dst);
2529 }
2530 }
2531
2532 /**
2533 * To be called after the last _mesa_add_state_reference() call, to
2534 * set up prog_data.param[] for assign_curb_setup() and
2535 * setup_pull_constants().
2536 */
2537 void
2538 fs_visitor::setup_paramvalues_refs()
2539 {
2540 /* Set up the pointers to ParamValues now that the array is finalized. */
2541 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
2542 c->prog_data.param[i] =
2543 fp->Base.Parameters->ParameterValues[this->param_index[i]] +
2544 this->param_offset[i];
2545 }
2546 }
2547
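/**
 * Maps UNIFORM-file accesses to the fixed GRFs where the CURBE (push
 * constant) payload lands, at eight float constants per register.
 */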
2548 void
2549 fs_visitor::assign_curb_setup()
2550 {
2551 c->prog_data.first_curbe_grf = c->nr_payload_regs;
2552 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2553
2554 /* Map the offsets in the UNIFORM file to fixed HW regs. */
2555 foreach_iter(exec_list_iterator, iter, this->instructions) {
2556 fs_inst *inst = (fs_inst *)iter.get();
2557
2558 for (unsigned int i = 0; i < 3; i++) {
2559 if (inst->src[i].file == UNIFORM) {
2560 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2561 struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
2562 constant_nr / 8,
2563 constant_nr % 8);
2564
2565 inst->src[i].file = FIXED_HW_REG;
2566 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
2567 }
2568 }
2569 }
2570 }
2571
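/**
 * Decides which URB slot each used varying input lands in. Gen6 packs
 * inputs in InputsRead order; older parts have to follow the SF unit's
 * mapping of VS outputs instead.
 */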
2572 void
2573 fs_visitor::calculate_urb_setup()
2574 {
2575 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2576 urb_setup[i] = -1;
2577 }
2578
2579 int urb_next = 0;
2580 /* Figure out where each of the incoming setup attributes lands. */
2581 if (intel->gen >= 6) {
2582 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2583 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2584 urb_setup[i] = urb_next++;
2585 }
2586 }
2587 } else {
2588 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2589 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2590 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2591 int fp_index;
2592
2593 if (i >= VERT_RESULT_VAR0)
2594 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2595 else if (i <= VERT_RESULT_TEX7)
2596 fp_index = i;
2597 else
2598 fp_index = -1;
2599
2600 if (fp_index >= 0)
2601 urb_setup[fp_index] = urb_next++;
2602 }
2603 }
2604 }
2605
2606 /* Each attribute is 4 setup channels, each of which is half a reg. */
2607 c->prog_data.urb_read_length = urb_next * 2;
2608 }
2609
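/**
 * Rebases LINTERP/CINTERP setup sources onto the GRFs where the URB
 * data actually starts, now that the CURBE size is known.
 */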
2610 void
2611 fs_visitor::assign_urb_setup()
2612 {
2613 int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
2614
2615 /* Offset all the urb_setup[] indices by the actual position of the
2616 * setup regs, now that the location of the constants has been chosen.
2617 */
2618 foreach_iter(exec_list_iterator, iter, this->instructions) {
2619 fs_inst *inst = (fs_inst *)iter.get();
2620
2621 if (inst->opcode == FS_OPCODE_LINTERP) {
2622 assert(inst->src[2].file == FIXED_HW_REG);
2623 inst->src[2].fixed_hw_reg.nr += urb_start;
2624 }
2625
2626 if (inst->opcode == FS_OPCODE_CINTERP) {
2627 assert(inst->src[0].file == FIXED_HW_REG);
2628 inst->src[0].fixed_hw_reg.nr += urb_start;
2629 }
2630 }
2631
2632 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2633 }
2634
2635 /**
2636 * Split large virtual GRFs into separate components if we can.
2637 *
2638 * This is mostly duplicated with what brw_fs_vector_splitting does,
2639 * but that's really conservative because it's afraid of doing
2640 * splitting that doesn't result in real progress after the rest of
2641 * the optimization phases, which would cause infinite looping in
2642 * optimization. We can do it once here, safely. This also has the
2643 * opportunity to split interpolated values, or maybe even uniforms,
2644 * which we don't have at the IR level.
2645 *
2646 * We want to split, because virtual GRFs are what we register
2647 * allocate and spill (due to contiguousness requirements for some
2648 * instructions), and they're what we naturally generate in the
2649 * codegen process, but most virtual GRFs don't actually need to be
2650 * contiguous sets of GRFs. If we split, we'll end up with reduced
2651 * live intervals and better dead code elimination and coalescing.
2652 */
2653 void
2654 fs_visitor::split_virtual_grfs()
2655 {
2656 int num_vars = this->virtual_grf_next;
2657 bool split_grf[num_vars];
2658 int new_virtual_grf[num_vars];
2659
2660 /* Try to split anything larger than one register. */
2661 for (int i = 0; i < num_vars; i++) {
2662 if (this->virtual_grf_sizes[i] != 1)
2663 split_grf[i] = true;
2664 else
2665 split_grf[i] = false;
2666 }
2667
2668 if (brw->has_pln) {
2669 /* PLN opcodes rely on the delta_xy being contiguous. */
2670 split_grf[this->delta_x.reg] = false;
2671 }
2672
2673 foreach_iter(exec_list_iterator, iter, this->instructions) {
2674 fs_inst *inst = (fs_inst *)iter.get();
2675
2676 /* Texturing produces 4 contiguous registers, so no splitting. */
2677 if (inst->is_tex()) {
2678 split_grf[inst->dst.reg] = false;
2679 }
2680 }
2681
2682 /* Allocate new space for split regs. Note that the virtual
2683 * numbers will be contiguous.
2684 */
2685 for (int i = 0; i < num_vars; i++) {
2686 if (split_grf[i]) {
2687 new_virtual_grf[i] = virtual_grf_alloc(1);
2688 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2689 int reg = virtual_grf_alloc(1);
2690 assert(reg == new_virtual_grf[i] + j - 1);
2691 (void) reg;
2692 }
2693 this->virtual_grf_sizes[i] = 1;
2694 }
2695 }
2696
2697 foreach_iter(exec_list_iterator, iter, this->instructions) {
2698 fs_inst *inst = (fs_inst *)iter.get();
2699
2700 if (inst->dst.file == GRF &&
2701 split_grf[inst->dst.reg] &&
2702 inst->dst.reg_offset != 0) {
2703 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2704 inst->dst.reg_offset - 1);
2705 inst->dst.reg_offset = 0;
2706 }
2707 for (int i = 0; i < 3; i++) {
2708 if (inst->src[i].file == GRF &&
2709 split_grf[inst->src[i].reg] &&
2710 inst->src[i].reg_offset != 0) {
2711 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2712 inst->src[i].reg_offset - 1);
2713 inst->src[i].reg_offset = 0;
2714 }
2715 }
2716 }
2717 this->live_intervals_valid = false;
2718 }
2719
2720 /**
2721 * Choose accesses from the UNIFORM file to demote to using the pull
2722 * constant buffer.
2723 *
2724 * We allow a fragment shader to have more than the GL-specified
2725 * minimum maximum number of fragment shader uniform components (64).
2726 * If there are too many, they'd fill up all of the register space.
2727 * So this pushes some of them out to the pull constant buffer and
2728 * updates the program to load them from there.
2729 */
2730 void
2731 fs_visitor::setup_pull_constants()
2732 {
2733 /* Only allow 16 registers (128 uniform components) as push constants. */
2734 unsigned int max_uniform_components = 16 * 8;
2735 if (c->prog_data.nr_params <= max_uniform_components)
2736 return;
2737
2738 /* Just demote the end of the list. We could probably do better
2739 * here, demoting things that are rarely used in the program first.
2740 */
2741 int pull_uniform_base = max_uniform_components;
2742 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
2743
2744 foreach_iter(exec_list_iterator, iter, this->instructions) {
2745 fs_inst *inst = (fs_inst *)iter.get();
2746
2747 for (int i = 0; i < 3; i++) {
2748 if (inst->src[i].file != UNIFORM)
2749 continue;
2750
2751 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2752 if (uniform_nr < pull_uniform_base)
2753 continue;
2754
2755 fs_reg dst = fs_reg(this, glsl_type::float_type);
2756 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
2757 dst);
2758 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
2759 pull->ir = inst->ir;
2760 pull->annotation = inst->annotation;
2761 pull->base_mrf = 14;
2762 pull->mlen = 1;
2763
2764 inst->insert_before(pull);
2765
2766 inst->src[i].file = GRF;
2767 inst->src[i].reg = dst.reg;
2768 inst->src[i].reg_offset = 0;
2769 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
2770 }
2771 }
2772
2773 for (int i = 0; i < pull_uniform_count; i++) {
2774 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
2775 c->prog_data.pull_param_convert[i] =
2776 c->prog_data.param_convert[pull_uniform_base + i];
2777 }
2778 c->prog_data.nr_params -= pull_uniform_count;
2779 c->prog_data.nr_pull_params = pull_uniform_count;
2780 }
2781
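/**
 * Computes a conservative [def, use] interval for each virtual GRF,
 * widening intervals to cover whole loops except where a
 * single-register def clearly dominates its uses.
 */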
2782 void
2783 fs_visitor::calculate_live_intervals()
2784 {
2785 int num_vars = this->virtual_grf_next;
2786 int *def = ralloc_array(mem_ctx, int, num_vars);
2787 int *use = ralloc_array(mem_ctx, int, num_vars);
2788 int loop_depth = 0;
2789 int loop_start = 0;
2790 int bb_header_ip = 0;
2791
2792 if (this->live_intervals_valid)
2793 return;
2794
2795 for (int i = 0; i < num_vars; i++) {
2796 def[i] = MAX_INSTRUCTION;
2797 use[i] = -1;
2798 }
2799
2800 int ip = 0;
2801 foreach_iter(exec_list_iterator, iter, this->instructions) {
2802 fs_inst *inst = (fs_inst *)iter.get();
2803
2804 if (inst->opcode == BRW_OPCODE_DO) {
2805 if (loop_depth++ == 0)
2806 loop_start = ip;
2807 } else if (inst->opcode == BRW_OPCODE_WHILE) {
2808 loop_depth--;
2809
2810 if (loop_depth == 0) {
2811 /* Patch up the uses of vars marked as live across
2812 * the whole loop.
2813 */
2814 for (int i = 0; i < num_vars; i++) {
2815 if (use[i] == loop_start) {
2816 use[i] = ip;
2817 }
2818 }
2819 }
2820 } else {
2821 for (unsigned int i = 0; i < 3; i++) {
2822 if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2823 int reg = inst->src[i].reg;
2824
2825 if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2826 def[reg] >= bb_header_ip)) {
2827 use[reg] = ip;
2828 } else {
2829 def[reg] = MIN2(loop_start, def[reg]);
2830 use[reg] = loop_start;
2831
2832 /* Nobody else is going to smash our start to a
2833 * later point in the loop now, because def[reg]
2834 * now points before the bb header.
2835 */
2836 }
2837 }
2838 }
2839 if (inst->dst.file == GRF && inst->dst.reg != 0) {
2840 int reg = inst->dst.reg;
2841
2842 if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2843 !inst->predicated)) {
2844 def[reg] = MIN2(def[reg], ip);
2845 } else {
2846 def[reg] = MIN2(def[reg], loop_start);
2847 }
2848 }
2849 }
2850
2851 ip++;
2852
2853 /* Set the basic block header IP. This is used for determining
2854 * if a complete def of a single-register virtual GRF in a loop
2855 * dominates a use in the same basic block. It's a quick way to
2856 * reduce the live interval range of most registers used in a
2857 * loop.
2858 */
2859 if (inst->opcode == BRW_OPCODE_IF ||
2860 inst->opcode == BRW_OPCODE_ELSE ||
2861 inst->opcode == BRW_OPCODE_ENDIF ||
2862 inst->opcode == BRW_OPCODE_DO ||
2863 inst->opcode == BRW_OPCODE_WHILE ||
2864 inst->opcode == BRW_OPCODE_BREAK ||
2865 inst->opcode == BRW_OPCODE_CONTINUE) {
2866 bb_header_ip = ip;
2867 }
2868 }
2869
2870 ralloc_free(this->virtual_grf_def);
2871 ralloc_free(this->virtual_grf_use);
2872 this->virtual_grf_def = def;
2873 this->virtual_grf_use = use;
2874
2875 this->live_intervals_valid = true;
2876 }
2877
2878 /**
2879 * Attempts to move immediate constants into the immediate
2880 * constant slot of following instructions.
2881 *
2882 * Immediate constants are a bit tricky -- they have to be in the last
2883 * operand slot, and you can't do abs/negate on them.
2884 */
2885
2886 bool
2887 fs_visitor::propagate_constants()
2888 {
2889 bool progress = false;
2890
2891 calculate_live_intervals();
2892
2893 foreach_iter(exec_list_iterator, iter, this->instructions) {
2894 fs_inst *inst = (fs_inst *)iter.get();
2895
2896 if (inst->opcode != BRW_OPCODE_MOV ||
2897 inst->predicated ||
2898 inst->dst.file != GRF || inst->src[0].file != IMM ||
2899 inst->dst.type != inst->src[0].type)
2900 continue;
2901
2902 /* Don't bother with cases where we should have had the
2903 * operation on the constant folded in GLSL already.
2904 */
2905 if (inst->saturate)
2906 continue;
2907
2908 /* Found a move of a constant to a GRF. Find anything else using the GRF
2909 * before it's written, and replace it with the constant if we can.
2910 */
2911 exec_list_iterator scan_iter = iter;
2912 scan_iter.next();
2913 for (; scan_iter.has_next(); scan_iter.next()) {
2914 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2915
2916 if (scan_inst->opcode == BRW_OPCODE_DO ||
2917 scan_inst->opcode == BRW_OPCODE_WHILE ||
2918 scan_inst->opcode == BRW_OPCODE_ELSE ||
2919 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2920 break;
2921 }
2922
2923 for (int i = 2; i >= 0; i--) {
2924 if (scan_inst->src[i].file != GRF ||
2925 scan_inst->src[i].reg != inst->dst.reg ||
2926 scan_inst->src[i].reg_offset != inst->dst.reg_offset)
2927 continue;
2928
2929 /* Don't bother with cases where we should have had the
2930 * operation on the constant folded in GLSL already.
2931 */
2932 if (scan_inst->src[i].negate || scan_inst->src[i].abs)
2933 continue;
2934
2935 switch (scan_inst->opcode) {
2936 case BRW_OPCODE_MOV:
2937 scan_inst->src[i] = inst->src[0];
2938 progress = true;
2939 break;
2940
2941 case BRW_OPCODE_MUL:
2942 case BRW_OPCODE_ADD:
2943 if (i == 1) {
2944 scan_inst->src[i] = inst->src[0];
2945 progress = true;
2946 } else if (i == 0 && scan_inst->src[1].file != IMM) {
2947 /* Fit this constant in by commuting the operands */
2948 scan_inst->src[0] = scan_inst->src[1];
2949 scan_inst->src[1] = inst->src[0];
2950 progress = true;
2951 }
2952 break;
2953 case BRW_OPCODE_CMP:
2954 case BRW_OPCODE_SEL:
2955 if (i == 1) {
2956 scan_inst->src[i] = inst->src[0];
2957 progress = true;
2958 }
2959 }
2960 }
2961
2962 if (scan_inst->dst.file == GRF &&
2963 scan_inst->dst.reg == inst->dst.reg &&
2964 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2965 scan_inst->is_tex())) {
2966 break;
2967 }
2968 }
2969 }
2970
2971 if (progress)
2972 this->live_intervals_valid = false;
2973
2974 return progress;
2975 }

2976 /**
2977 * Must be called after calculate_live_intervals() to remove unused
2978 * writes to registers -- register allocation will fail otherwise
2979 * because something def'd but not used won't be considered to
2980 * interfere with other regs.
2981 */
2982 bool
2983 fs_visitor::dead_code_eliminate()
2984 {
2985 bool progress = false;
2986 int pc = 0;
2987
2988 calculate_live_intervals();
2989
2990 foreach_iter(exec_list_iterator, iter, this->instructions) {
2991 fs_inst *inst = (fs_inst *)iter.get();
2992
2993 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
2994 inst->remove();
2995 progress = true;
2996 }
2997
2998 pc++;
2999 }
3000
3001 if (progress)
3002 live_intervals_valid = false;
3003
3004 return progress;
3005 }
3006
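/**
 * Tries to eliminate GRF-to-GRF moves by rewriting the later readers
 * of the destination to read the source directly, when neither
 * register is rewritten in between.
 */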
3007 bool
3008 fs_visitor::register_coalesce()
3009 {
3010 bool progress = false;
3011 int if_depth = 0;
3012 int loop_depth = 0;
3013
3014 foreach_iter(exec_list_iterator, iter, this->instructions) {
3015 fs_inst *inst = (fs_inst *)iter.get();
3016
3017 /* Make sure that we dominate the instructions we're going to
3018 * scan for interference with our coalescing; otherwise the
3019 * forward scan below won't see everything that could interfere.
3020 * We don't dominate the following instructions if we're in a
3021 * loop or an if block.
3022 */
3023 switch (inst->opcode) {
3024 case BRW_OPCODE_DO:
3025 loop_depth++;
3026 break;
3027 case BRW_OPCODE_WHILE:
3028 loop_depth--;
3029 break;
3030 case BRW_OPCODE_IF:
3031 if_depth++;
3032 break;
3033 case BRW_OPCODE_ENDIF:
3034 if_depth--;
3035 break;
3036 }
3037 if (loop_depth || if_depth)
3038 continue;
3039
3040 if (inst->opcode != BRW_OPCODE_MOV ||
3041 inst->predicated ||
3042 inst->saturate ||
3043 inst->dst.file != GRF || inst->src[0].file != GRF ||
3044 inst->dst.type != inst->src[0].type)
3045 continue;
3046
3047 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
3048
3049 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
3050 * them: check for no writes to either one until the exit of the
3051 * program.
3052 */
3053 bool interfered = false;
3054 exec_list_iterator scan_iter = iter;
3055 scan_iter.next();
3056 for (; scan_iter.has_next(); scan_iter.next()) {
3057 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3058
3059 if (scan_inst->dst.file == GRF) {
3060 if (scan_inst->dst.reg == inst->dst.reg &&
3061 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
3062 scan_inst->is_tex())) {
3063 interfered = true;
3064 break;
3065 }
3066 if (scan_inst->dst.reg == inst->src[0].reg &&
3067 (scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
3068 scan_inst->is_tex())) {
3069 interfered = true;
3070 break;
3071 }
3072 }
3073
3074 /* The gen6 MATH instruction can't handle source modifiers, so avoid
3075 * coalescing those for now. We should do something more specific.
3076 */
3077 if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) {
3078 interfered = true;
3079 break;
3080 }
3081 }
3082 if (interfered) {
3083 continue;
3084 }
3085
3086 /* Rewrite the later usage to point at the source of the move to
3087 * be removed.
3088 */
3089 for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
3090 scan_iter.next()) {
3091 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3092
3093 for (int i = 0; i < 3; i++) {
3094 if (scan_inst->src[i].file == GRF &&
3095 scan_inst->src[i].reg == inst->dst.reg &&
3096 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
3097 scan_inst->src[i].reg = inst->src[0].reg;
3098 scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
3099 scan_inst->src[i].abs |= inst->src[0].abs;
3100 scan_inst->src[i].negate ^= inst->src[0].negate;
3101 scan_inst->src[i].smear = inst->src[0].smear;
3102 }
3103 }
3104 }
3105
3106 inst->remove();
3107 progress = true;
3108 }
3109
3110 if (progress)
3111 live_intervals_valid = false;
3112
3113 return progress;
3114 }
3115
3116
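/**
 * Tries to remove GRF-to-MRF moves by making the instruction that
 * computed the GRF value write into the MRF directly, when that can
 * be proven safe.
 */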
3117 bool
3118 fs_visitor::compute_to_mrf()
3119 {
3120 bool progress = false;
3121 int next_ip = 0;
3122
3123 calculate_live_intervals();
3124
3125 foreach_iter(exec_list_iterator, iter, this->instructions) {
3126 fs_inst *inst = (fs_inst *)iter.get();
3127
3128 int ip = next_ip;
3129 next_ip++;
3130
3131 if (inst->opcode != BRW_OPCODE_MOV ||
3132 inst->predicated ||
3133 inst->dst.file != MRF || inst->src[0].file != GRF ||
3134 inst->dst.type != inst->src[0].type ||
3135 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
3136 continue;
3137
3138 /* Can't compute-to-MRF this GRF if someone else was going to
3139 * read it later.
3140 */
3141 if (this->virtual_grf_use[inst->src[0].reg] > ip)
3142 continue;
3143
3144 /* Found a move of a GRF to a MRF. Let's see if we can rewrite
3145 * the instruction that generated this GRF value to write into
3146 * the MRF directly.
3147 */
3147 fs_inst *scan_inst;
3148 for (scan_inst = (fs_inst *)inst->prev;
3149 scan_inst->prev != NULL;
3150 scan_inst = (fs_inst *)scan_inst->prev) {
3151 if (scan_inst->dst.file == GRF &&
3152 scan_inst->dst.reg == inst->src[0].reg) {
3153 /* Found the last write to the reg we want to turn
3154 * into a compute-to-MRF.
3155 */
3156
3157 if (scan_inst->is_tex()) {
3158 /* Texturing writes several contiguous regs, so we can't
3159 * compute-to-mrf that.
3160 */
3161 break;
3162 }
3163
3164 /* If it's predicated, it (probably) didn't populate all
3165 * the channels.
3166 */
3167 if (scan_inst->predicated)
3168 break;
3169
3170 /* SEND instructions can't have MRF as a destination. */
3171 if (scan_inst->mlen)
3172 break;
3173
3174 if (intel->gen >= 6) {
3175 /* gen6 math instructions must have the destination be
3176 * GRF, so no compute-to-MRF for them.
3177 */
3178 if (scan_inst->is_math()) {
3179 break;
3180 }
3181 }
3182
3183 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
3184 /* Found the creator of our MRF's source value. */
3185 scan_inst->dst.file = MRF;
3186 scan_inst->dst.hw_reg = inst->dst.hw_reg;
3187 scan_inst->saturate |= inst->saturate;
3188 inst->remove();
3189 progress = true;
3190 }
3191 break;
3192 }
3193
3194 /* We don't handle flow control here. Most computation of
3195 * values that end up in MRFs are shortly before the MRF
3196 * write anyway.
3197 */
3198 if (scan_inst->opcode == BRW_OPCODE_DO ||
3199 scan_inst->opcode == BRW_OPCODE_WHILE ||
3200 scan_inst->opcode == BRW_OPCODE_ELSE ||
3201 scan_inst->opcode == BRW_OPCODE_ENDIF) {
3202 break;
3203 }
3204
3205 /* You can't read from an MRF, so if someone else reads our
3206 * MRF's source GRF that we wanted to rewrite, that stops us.
3207 */
3208 bool interfered = false;
3209 for (int i = 0; i < 3; i++) {
3210 if (scan_inst->src[i].file == GRF &&
3211 scan_inst->src[i].reg == inst->src[0].reg &&
3212 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
3213 interfered = true;
3214 }
3215 }
3216 if (interfered)
3217 break;
3218
3219 if (scan_inst->dst.file == MRF &&
3220 scan_inst->dst.hw_reg == inst->dst.hw_reg) {
3221 /* Somebody else wrote our MRF here, so we can't
3222 * compute-to-MRF before that.
3223 */
3224 break;
3225 }
3226
3227 if (scan_inst->mlen > 0) {
3228 /* Found a SEND instruction, which means that there are
3229 * live values in MRFs from base_mrf to base_mrf +
3230 * scan_inst->mlen - 1. Don't go pushing our MRF write up
3231 * above it.
3232 */
3233 if (inst->dst.hw_reg >= scan_inst->base_mrf &&
3234 inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) {
3235 break;
3236 }
3237 }
3238 }
3239 }
3240
3241 return progress;
3242 }
3243
3244 /**
3245 * Walks through basic blocks, looking for repeated MRF writes and
3246 * removing the later ones.
3247 */
3248 bool
3249 fs_visitor::remove_duplicate_mrf_writes()
3250 {
3251 fs_inst *last_mrf_move[16];
3252 bool progress = false;
3253
3254 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3255
3256 foreach_iter(exec_list_iterator, iter, this->instructions) {
3257 fs_inst *inst = (fs_inst *)iter.get();
3258
3259 switch (inst->opcode) {
3260 case BRW_OPCODE_DO:
3261 case BRW_OPCODE_WHILE:
3262 case BRW_OPCODE_IF:
3263 case BRW_OPCODE_ELSE:
3264 case BRW_OPCODE_ENDIF:
3265 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3266 continue;
3267 default:
3268 break;
3269 }
3270
3271 if (inst->opcode == BRW_OPCODE_MOV &&
3272 inst->dst.file == MRF) {
3273 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
3274 if (prev_inst && inst->equals(prev_inst)) {
3275 inst->remove();
3276 progress = true;
3277 continue;
3278 }
3279 }
3280
3281 /* Clear out the last-write records for MRFs that were overwritten. */
3282 if (inst->dst.file == MRF) {
3283 last_mrf_move[inst->dst.hw_reg] = NULL;
3284 }
3285
3286 if (inst->mlen > 0) {
3287 /* Found a SEND instruction, which will include two or fewer
3288 * implied MRF writes. We could do better here.
3289 */
3290 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3291 last_mrf_move[inst->base_mrf + i] = NULL;
3292 }
3293 }
3294
3295 /* Clear out any MRF move records whose sources got overwritten. */
3296 if (inst->dst.file == GRF) {
3297 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
3298 if (last_mrf_move[i] &&
3299 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3300 last_mrf_move[i] = NULL;
3301 }
3302 }
3303 }
3304
3305 if (inst->opcode == BRW_OPCODE_MOV &&
3306 inst->dst.file == MRF &&
3307 inst->src[0].file == GRF &&
3308 !inst->predicated) {
3309 last_mrf_move[inst->dst.hw_reg] = inst;
3310 }
3311 }
3312
3313 return progress;
3314 }
3315
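/**
 * Returns whether the live intervals of two virtual GRFs overlap;
 * this is the interference test the register allocator builds on.
 */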
3316 bool
3317 fs_visitor::virtual_grf_interferes(int a, int b)
3318 {
3319 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3320 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
3321
3322 /* We can't handle dead register writes here, without iterating
3323 * over the whole instruction stream to find every single dead
3324 * write to that register to compare to the live interval of the
3325 * other register. Just assert that dead_code_eliminate() has been
3326 * called.
3327 */
3328 assert((this->virtual_grf_use[a] != -1 ||
3329 this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
3330 (this->virtual_grf_use[b] != -1 ||
3331 this->virtual_grf_def[b] == MAX_INSTRUCTION));
3332
3333 return start < end;
3334 }
3335
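/* Lowers our virtual fs_reg description to a hardware brw_reg. */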
3336 static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
3337 {
3338 struct brw_reg brw_reg;
3339
3340 switch (reg->file) {
3341 case GRF:
3342 case ARF:
3343 case MRF:
3344 if (reg->smear == -1) {
3345 brw_reg = brw_vec8_reg(reg->file,
3346 reg->hw_reg, 0);
3347 } else {
3348 brw_reg = brw_vec1_reg(reg->file,
3349 reg->hw_reg, reg->smear);
3350 }
3351 brw_reg = retype(brw_reg, reg->type);
3352 break;
3353 case IMM:
3354 switch (reg->type) {
3355 case BRW_REGISTER_TYPE_F:
3356 brw_reg = brw_imm_f(reg->imm.f);
3357 break;
3358 case BRW_REGISTER_TYPE_D:
3359 brw_reg = brw_imm_d(reg->imm.i);
3360 break;
3361 case BRW_REGISTER_TYPE_UD:
3362 brw_reg = brw_imm_ud(reg->imm.u);
3363 break;
3364 default:
3365 assert(!"not reached");
3366 brw_reg = brw_null_reg();
3367 break;
3368 }
3369 break;
3370 case FIXED_HW_REG:
3371 brw_reg = reg->fixed_hw_reg;
3372 break;
3373 case BAD_FILE:
3374 /* Probably unused. */
3375 brw_reg = brw_null_reg();
3376 break;
3377 case UNIFORM:
3378 assert(!"not reached");
3379 brw_reg = brw_null_reg();
3380 break;
3381 default:
3382 assert(!"not reached");
3383 brw_reg = brw_null_reg();
3384 break;
3385 }
3386 if (reg->abs)
3387 brw_reg = brw_abs(brw_reg);
3388 if (reg->negate)
3389 brw_reg = negate(brw_reg);
3390
3391 return brw_reg;
3392 }
3393
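/**
 * The final codegen pass: walks the fs_inst list, lowers each
 * instruction to native code through the brw_eu assembler, and tracks
 * if/loop nesting so control-flow jumps can be patched up.
 */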
3394 void
3395 fs_visitor::generate_code()
3396 {
3397 int last_native_inst = 0;
3398 const char *last_annotation_string = NULL;
3399 ir_instruction *last_annotation_ir = NULL;
3400
3401 int if_stack_array_size = 16;
3402 int loop_stack_array_size = 16;
3403 int if_stack_depth = 0, loop_stack_depth = 0;
3404 brw_instruction **if_stack =
3405 rzalloc_array(this->mem_ctx, brw_instruction *, if_stack_array_size);
3406 brw_instruction **loop_stack =
3407 rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
3408 int *if_depth_in_loop =
3409 rzalloc_array(this->mem_ctx, int, loop_stack_array_size);
3410
3411
3412 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3413 printf("Native code for fragment shader %d:\n",
3414 ctx->Shader.CurrentFragmentProgram->Name);
3415 }
3416
3417 foreach_iter(exec_list_iterator, iter, this->instructions) {
3418 fs_inst *inst = (fs_inst *)iter.get();
3419 struct brw_reg src[3], dst;
3420
3421 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3422 if (last_annotation_ir != inst->ir) {
3423 last_annotation_ir = inst->ir;
3424 if (last_annotation_ir) {
3425 printf(" ");
3426 last_annotation_ir->print();
3427 printf("\n");
3428 }
3429 }
3430 if (last_annotation_string != inst->annotation) {
3431 last_annotation_string = inst->annotation;
3432 if (last_annotation_string)
3433 printf(" %s\n", last_annotation_string);
3434 }
3435 }
3436
3437 for (unsigned int i = 0; i < 3; i++) {
3438 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
3439 }
3440 dst = brw_reg_from_fs_reg(&inst->dst);
3441
3442 brw_set_conditionalmod(p, inst->conditional_mod);
3443 brw_set_predicate_control(p, inst->predicated);
3444 brw_set_saturate(p, inst->saturate);
3445
3446 switch (inst->opcode) {
3447 case BRW_OPCODE_MOV:
3448 brw_MOV(p, dst, src[0]);
3449 break;
3450 case BRW_OPCODE_ADD:
3451 brw_ADD(p, dst, src[0], src[1]);
3452 break;
3453 case BRW_OPCODE_MUL:
3454 brw_MUL(p, dst, src[0], src[1]);
3455 break;
3456
3457 case BRW_OPCODE_FRC:
3458 brw_FRC(p, dst, src[0]);
3459 break;
3460 case BRW_OPCODE_RNDD:
3461 brw_RNDD(p, dst, src[0]);
3462 break;
3463 case BRW_OPCODE_RNDE:
3464 brw_RNDE(p, dst, src[0]);
3465 break;
3466 case BRW_OPCODE_RNDZ:
3467 brw_RNDZ(p, dst, src[0]);
3468 break;
3469
3470 case BRW_OPCODE_AND:
3471 brw_AND(p, dst, src[0], src[1]);
3472 break;
3473 case BRW_OPCODE_OR:
3474 brw_OR(p, dst, src[0], src[1]);
3475 break;
3476 case BRW_OPCODE_XOR:
3477 brw_XOR(p, dst, src[0], src[1]);
3478 break;
3479 case BRW_OPCODE_NOT:
3480 brw_NOT(p, dst, src[0]);
3481 break;
3482 case BRW_OPCODE_ASR:
3483 brw_ASR(p, dst, src[0], src[1]);
3484 break;
3485 case BRW_OPCODE_SHR:
3486 brw_SHR(p, dst, src[0], src[1]);
3487 break;
3488 case BRW_OPCODE_SHL:
3489 brw_SHL(p, dst, src[0], src[1]);
3490 break;
3491
3492 case BRW_OPCODE_CMP:
3493 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
3494 break;
3495 case BRW_OPCODE_SEL:
3496 brw_SEL(p, dst, src[0], src[1]);
3497 break;
3498
3499 case BRW_OPCODE_IF:
3500 if (inst->src[0].file != BAD_FILE) {
3501 assert(intel->gen >= 6);
3502 if_stack[if_stack_depth] = gen6_IF(p, inst->conditional_mod, src[0], src[1]);
3503 } else {
3504 if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
3505 }
3506 if_depth_in_loop[loop_stack_depth]++;
3507 if_stack_depth++;
3508 if (if_stack_array_size <= if_stack_depth) {
3509 if_stack_array_size *= 2;
3510 if_stack = reralloc(this->mem_ctx, if_stack, brw_instruction *,
3511 if_stack_array_size);
3512 }
3513 break;
3514
3515 case BRW_OPCODE_ELSE:
3516 if_stack[if_stack_depth - 1] =
3517 brw_ELSE(p, if_stack[if_stack_depth - 1]);
3518 break;
3519 case BRW_OPCODE_ENDIF:
3520 if_stack_depth--;
3521 brw_ENDIF(p, if_stack[if_stack_depth]);
3522 if_depth_in_loop[loop_stack_depth]--;
3523 break;
3524
3525 case BRW_OPCODE_DO:
3526 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
3527 if (loop_stack_array_size <= loop_stack_depth) {
3528 loop_stack_array_size *= 2;
3529 loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *,
3530 loop_stack_array_size);
3531 if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
3532 loop_stack_array_size);
3533 }
3534 if_depth_in_loop[loop_stack_depth] = 0;
3535 break;
3536
3537 case BRW_OPCODE_BREAK:
3538 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
3539 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3540 break;
3541 case BRW_OPCODE_CONTINUE:
3542 /* FINISHME: We still need to write the loop instruction support. */
3543 if (intel->gen >= 6)
3544 gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
3545 else
3546 brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
3547 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3548 break;
3549
3550 case BRW_OPCODE_WHILE: {
3551 struct brw_instruction *inst0, *inst1;
3552 GLuint br = 1;
3553
3554 if (intel->gen >= 5)
3555 br = 2;
3556
3557 assert(loop_stack_depth > 0);
3558 loop_stack_depth--;
3559 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
3560 if (intel->gen < 6) {
3561 /* Patch all the BREAK/CONT instructions from the last BGNLOOP. */
3562 while (inst0 > loop_stack[loop_stack_depth]) {
3563 inst0--;
3564 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
3565 inst0->bits3.if_else.jump_count == 0) {
3566 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
3567 }
3568 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
3569 inst0->bits3.if_else.jump_count == 0) {
3570 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3571 }
3572 }
3573 }
3574 }
3575 break;
3576
3577 case FS_OPCODE_RCP:
3578 case FS_OPCODE_RSQ:
3579 case FS_OPCODE_SQRT:
3580 case FS_OPCODE_EXP2:
3581 case FS_OPCODE_LOG2:
3582 case FS_OPCODE_POW:
3583 case FS_OPCODE_SIN:
3584 case FS_OPCODE_COS:
3585 generate_math(inst, dst, src);
3586 break;
3587 case FS_OPCODE_CINTERP:
3588 brw_MOV(p, dst, src[0]);
3589 break;
3590 case FS_OPCODE_LINTERP:
3591 generate_linterp(inst, dst, src);
3592 break;
3593 case FS_OPCODE_TEX:
3594 case FS_OPCODE_TXB:
3595 case FS_OPCODE_TXD:
3596 case FS_OPCODE_TXL:
3597 generate_tex(inst, dst, src[0]);
3598 break;
3599 case FS_OPCODE_DISCARD_NOT:
3600 generate_discard_not(inst, dst);
3601 break;
3602 case FS_OPCODE_DISCARD_AND:
3603 generate_discard_and(inst, src[0]);
3604 break;
3605 case FS_OPCODE_DDX:
3606 generate_ddx(inst, dst, src[0]);
3607 break;
3608 case FS_OPCODE_DDY:
3609 generate_ddy(inst, dst, src[0]);
3610 break;
3611
3612 case FS_OPCODE_SPILL:
3613 generate_spill(inst, src[0]);
3614 break;
3615
3616 case FS_OPCODE_UNSPILL:
3617 generate_unspill(inst, dst);
3618 break;
3619
3620 case FS_OPCODE_PULL_CONSTANT_LOAD:
3621 generate_pull_constant_load(inst, dst);
3622 break;
3623
3624 case FS_OPCODE_FB_WRITE:
3625 generate_fb_write(inst);
3626 break;
3627 default:
3628 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
3629 _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
3630 brw_opcodes[inst->opcode].name);
3631 } else {
3632 _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
3633 }
3634 this->fail = true;
3635 }
3636
3637 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3638 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
3639 if (0) {
3640 printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3641 ((uint32_t *)&p->store[i])[3],
3642 ((uint32_t *)&p->store[i])[2],
3643 ((uint32_t *)&p->store[i])[1],
3644 ((uint32_t *)&p->store[i])[0]);
3645 }
3646 brw_disasm(stdout, &p->store[i], intel->gen);
3647 }
3648 }
3649
3650 last_native_inst = p->nr_insn;
3651 }
3652
3653 ralloc_free(if_stack);
3654 ralloc_free(loop_stack);
3655 ralloc_free(if_depth_in_loop);
3656
3657 brw_set_uip_jip(p);
3658
3659 /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
3660 * emit issues, it doesn't get the jump distances into the output,
3661 * which is often something we want to debug. So this is here in
3662 * case you're doing that.
3663 */
3664 if (0) {
3665 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3666 for (unsigned int i = 0; i < p->nr_insn; i++) {
3667 printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3668 ((uint32_t *)&p->store[i])[3],
3669 ((uint32_t *)&p->store[i])[2],
3670 ((uint32_t *)&p->store[i])[1],
3671 ((uint32_t *)&p->store[i])[0]);
3672 brw_disasm(stdout, &p->store[i], intel->gen);
3673 }
3674 }
3675 }
3676 }
3677
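/**
 * Top-level entry point of the GLSL FS backend: builds the fs_visitor
 * IR for the shader, runs the optimization loop, assigns registers,
 * and generates native code.
 */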
3678 GLboolean
3679 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
3680 {
3681 struct intel_context *intel = &brw->intel;
3682 struct gl_context *ctx = &intel->ctx;
3683 struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
3684
3685 if (!prog)
3686 return GL_FALSE;
3687
3688 struct brw_shader *shader =
3689 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3690 if (!shader)
3691 return GL_FALSE;
3692
3693 /* We always use 8-wide mode, at least for now. For one, flow
3694 * control only works in 8-wide. Also, when we're fragment-shader
3695 * bound, we're almost always under register pressure as well, so
3696 * 8-wide keeps us off the performance cliff of spilling
3697 * regs.
3698 */
3699 c->dispatch_width = 8;
3700
3701 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3702 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3703 _mesa_print_ir(shader->ir, NULL);
3704 printf("\n");
3705 }
3706
3707 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3708 */
3709 fs_visitor v(c, shader);
3710
3711 if (0) {
3712 v.emit_dummy_fs();
3713 } else {
3714 v.calculate_urb_setup();
3715 if (intel->gen < 6)
3716 v.emit_interpolation_setup_gen4();
3717 else
3718 v.emit_interpolation_setup_gen6();
3719
3720 /* Generate FS IR for main(). (The visitor only descends into
3721 * functions called "main".)
3722 */
3723 foreach_iter(exec_list_iterator, iter, *shader->ir) {
3724 ir_instruction *ir = (ir_instruction *)iter.get();
3725 v.base_ir = ir;
3726 ir->accept(&v);
3727 }
3728
3729 v.emit_fb_writes();
3730
3731 v.split_virtual_grfs();
3732
3733 v.setup_paramvalues_refs();
3734 v.setup_pull_constants();
3735
3736 bool progress;
3737 do {
3738 progress = false;
3739
3740 progress = v.remove_duplicate_mrf_writes() || progress;
3741
3742 progress = v.propagate_constants() || progress;
3743 progress = v.register_coalesce() || progress;
3744 progress = v.compute_to_mrf() || progress;
3745 progress = v.dead_code_eliminate() || progress;
3746 } while (progress);
3747
3748 v.schedule_instructions();
3749
3750 v.assign_curb_setup();
3751 v.assign_urb_setup();
3752
3753 if (0) {
3754 /* Debug of register spilling: Go spill everything. */
3755 int virtual_grf_count = v.virtual_grf_next;
3756 for (int i = 1; i < virtual_grf_count; i++) {
3757 v.spill_reg(i);
3758 }
3759 }
3760
3761 if (0)
3762 v.assign_regs_trivial();
3763 else {
3764 while (!v.assign_regs()) {
3765 if (v.fail)
3766 break;
3767 }
3768 }
3769 }
3770
3771 if (!v.fail)
3772 v.generate_code();
3773
3774 assert(!v.fail); /* FINISHME: Cleanly fail, tested at link time, etc. */
3775
3776 if (v.fail)
3777 return GL_FALSE;
3778
3779 c->prog_data.total_grf = v.grf_used;
3780
3781 return GL_TRUE;
3782 }