2 * Copyright © 2010 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
33 #include <sys/types.h>
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "program/prog_parameter.h"
39 #include "program/prog_print.h"
40 #include "program/register_allocate.h"
41 #include "program/sampler.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
47 #include "brw_shader.h"
49 #include "../glsl/glsl_types.h"
50 #include "../glsl/ir_print_visitor.h"
52 #define MAX_INSTRUCTION (1 << 30)
55 fs_visitor::type_size(const struct glsl_type
*type
)
59 switch (type
->base_type
) {
64 return type
->components();
66 return type_size(type
->fields
.array
) * type
->length
;
67 case GLSL_TYPE_STRUCT
:
69 for (i
= 0; i
< type
->length
; i
++) {
70 size
+= type_size(type
->fields
.structure
[i
].type
);
73 case GLSL_TYPE_SAMPLER
:
74 /* Samplers take up no register space, since they're baked in at
79 assert(!"not reached");
85 fs_visitor::fail(const char *format
, ...)
90 if (INTEL_DEBUG
& DEBUG_WM
) {
91 fprintf(stderr
, "FS compile failed: ");
95 vfprintf(stderr
, format
, va
);
102 fs_visitor::push_force_uncompressed()
104 force_uncompressed_stack
++;
108 fs_visitor::pop_force_uncompressed()
110 force_uncompressed_stack
--;
111 assert(force_uncompressed_stack
>= 0);
115 fs_visitor::push_force_sechalf()
117 force_sechalf_stack
++;
121 fs_visitor::pop_force_sechalf()
123 force_sechalf_stack
--;
124 assert(force_sechalf_stack
>= 0);
128 * Returns how many MRFs an FS opcode will write over.
130 * Note that this is not the 0 or 1 implied writes in an actual gen
131 * instruction -- the FS opcodes often generate MOVs in addition.
134 fs_visitor::implied_mrf_writes(fs_inst
*inst
)
139 switch (inst
->opcode
) {
147 return 1 * c
->dispatch_width
/ 8;
149 return 2 * c
->dispatch_width
/ 8;
155 case FS_OPCODE_FB_WRITE
:
157 case FS_OPCODE_PULL_CONSTANT_LOAD
:
158 case FS_OPCODE_UNSPILL
:
160 case FS_OPCODE_SPILL
:
163 assert(!"not reached");
169 fs_visitor::virtual_grf_alloc(int size
)
171 if (virtual_grf_array_size
<= virtual_grf_next
) {
172 if (virtual_grf_array_size
== 0)
173 virtual_grf_array_size
= 16;
175 virtual_grf_array_size
*= 2;
176 virtual_grf_sizes
= reralloc(mem_ctx
, virtual_grf_sizes
, int,
177 virtual_grf_array_size
);
179 /* This slot is always unused. */
180 virtual_grf_sizes
[0] = 0;
182 virtual_grf_sizes
[virtual_grf_next
] = size
;
183 return virtual_grf_next
++;
186 /** Fixed HW reg constructor. */
187 fs_reg::fs_reg(enum register_file file
, int hw_reg
)
191 this->hw_reg
= hw_reg
;
192 this->type
= BRW_REGISTER_TYPE_F
;
195 /** Fixed HW reg constructor. */
196 fs_reg::fs_reg(enum register_file file
, int hw_reg
, uint32_t type
)
200 this->hw_reg
= hw_reg
;
204 /** Automatic reg constructor. */
205 fs_reg::fs_reg(class fs_visitor
*v
, const struct glsl_type
*type
)
210 this->reg
= v
->virtual_grf_alloc(v
->type_size(type
));
211 this->reg_offset
= 0;
212 this->type
= brw_type_for_base_type(type
);
216 fs_visitor::variable_storage(ir_variable
*var
)
218 return (fs_reg
*)hash_table_find(this->variable_ht
, var
);
222 import_uniforms_callback(const void *key
,
226 struct hash_table
*dst_ht
= (struct hash_table
*)closure
;
227 const fs_reg
*reg
= (const fs_reg
*)data
;
229 if (reg
->file
!= UNIFORM
)
232 hash_table_insert(dst_ht
, data
, key
);
235 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
236 * This brings in those uniform definitions
239 fs_visitor::import_uniforms(struct hash_table
*src_variable_ht
)
241 hash_table_call_foreach(src_variable_ht
,
242 import_uniforms_callback
,
246 /* Our support for uniforms is piggy-backed on the struct
247 * gl_fragment_program, because that's where the values actually
248 * get stored, rather than in some global gl_shader_program uniform
252 fs_visitor::setup_uniform_values(int loc
, const glsl_type
*type
)
254 unsigned int offset
= 0;
256 if (type
->is_matrix()) {
257 const glsl_type
*column
= glsl_type::get_instance(GLSL_TYPE_FLOAT
,
258 type
->vector_elements
,
261 for (unsigned int i
= 0; i
< type
->matrix_columns
; i
++) {
262 offset
+= setup_uniform_values(loc
+ offset
, column
);
268 switch (type
->base_type
) {
269 case GLSL_TYPE_FLOAT
:
273 for (unsigned int i
= 0; i
< type
->vector_elements
; i
++) {
274 unsigned int param
= c
->prog_data
.nr_params
++;
276 assert(param
< ARRAY_SIZE(c
->prog_data
.param
));
278 switch (type
->base_type
) {
279 case GLSL_TYPE_FLOAT
:
280 c
->prog_data
.param_convert
[param
] = PARAM_NO_CONVERT
;
283 c
->prog_data
.param_convert
[param
] = PARAM_CONVERT_F2U
;
286 c
->prog_data
.param_convert
[param
] = PARAM_CONVERT_F2I
;
289 c
->prog_data
.param_convert
[param
] = PARAM_CONVERT_F2B
;
292 assert(!"not reached");
293 c
->prog_data
.param_convert
[param
] = PARAM_NO_CONVERT
;
296 this->param_index
[param
] = loc
;
297 this->param_offset
[param
] = i
;
301 case GLSL_TYPE_STRUCT
:
302 for (unsigned int i
= 0; i
< type
->length
; i
++) {
303 offset
+= setup_uniform_values(loc
+ offset
,
304 type
->fields
.structure
[i
].type
);
308 case GLSL_TYPE_ARRAY
:
309 for (unsigned int i
= 0; i
< type
->length
; i
++) {
310 offset
+= setup_uniform_values(loc
+ offset
, type
->fields
.array
);
314 case GLSL_TYPE_SAMPLER
:
315 /* The sampler takes up a slot, but we don't use any values from it. */
319 assert(!"not reached");
325 /* Our support for builtin uniforms is even scarier than non-builtin.
326 * It sits on top of the PROG_STATE_VAR parameters that are
327 * automatically updated from GL context state.
330 fs_visitor::setup_builtin_uniform_values(ir_variable
*ir
)
332 const ir_state_slot
*const slots
= ir
->state_slots
;
333 assert(ir
->state_slots
!= NULL
);
335 for (unsigned int i
= 0; i
< ir
->num_state_slots
; i
++) {
336 /* This state reference has already been setup by ir_to_mesa, but we'll
337 * get the same index back here.
339 int index
= _mesa_add_state_reference(this->fp
->Base
.Parameters
,
340 (gl_state_index
*)slots
[i
].tokens
);
342 /* Add each of the unique swizzles of the element as a parameter.
343 * This'll end up matching the expected layout of the
344 * array/matrix/structure we're trying to fill in.
347 for (unsigned int j
= 0; j
< 4; j
++) {
348 int swiz
= GET_SWZ(slots
[i
].swizzle
, j
);
349 if (swiz
== last_swiz
)
353 c
->prog_data
.param_convert
[c
->prog_data
.nr_params
] =
355 this->param_index
[c
->prog_data
.nr_params
] = index
;
356 this->param_offset
[c
->prog_data
.nr_params
] = swiz
;
357 c
->prog_data
.nr_params
++;
363 fs_visitor::emit_fragcoord_interpolation(ir_variable
*ir
)
365 fs_reg
*reg
= new(this->mem_ctx
) fs_reg(this, ir
->type
);
367 bool flip
= !ir
->origin_upper_left
^ c
->key
.render_to_fbo
;
370 if (ir
->pixel_center_integer
) {
371 emit(BRW_OPCODE_MOV
, wpos
, this->pixel_x
);
373 emit(BRW_OPCODE_ADD
, wpos
, this->pixel_x
, fs_reg(0.5f
));
378 if (!flip
&& ir
->pixel_center_integer
) {
379 emit(BRW_OPCODE_MOV
, wpos
, this->pixel_y
);
381 fs_reg pixel_y
= this->pixel_y
;
382 float offset
= (ir
->pixel_center_integer
? 0.0 : 0.5);
385 pixel_y
.negate
= true;
386 offset
+= c
->key
.drawable_height
- 1.0;
389 emit(BRW_OPCODE_ADD
, wpos
, pixel_y
, fs_reg(offset
));
394 if (intel
->gen
>= 6) {
395 emit(BRW_OPCODE_MOV
, wpos
,
396 fs_reg(brw_vec8_grf(c
->source_depth_reg
, 0)));
398 emit(FS_OPCODE_LINTERP
, wpos
, this->delta_x
, this->delta_y
,
399 interp_reg(FRAG_ATTRIB_WPOS
, 2));
403 /* gl_FragCoord.w: Already set up in emit_interpolation */
404 emit(BRW_OPCODE_MOV
, wpos
, this->wpos_w
);
410 fs_visitor::emit_general_interpolation(ir_variable
*ir
)
412 fs_reg
*reg
= new(this->mem_ctx
) fs_reg(this, ir
->type
);
413 /* Interpolation is always in floating point regs. */
414 reg
->type
= BRW_REGISTER_TYPE_F
;
417 unsigned int array_elements
;
418 const glsl_type
*type
;
420 if (ir
->type
->is_array()) {
421 array_elements
= ir
->type
->length
;
422 if (array_elements
== 0) {
423 fail("dereferenced array '%s' has length 0\n", ir
->name
);
425 type
= ir
->type
->fields
.array
;
431 int location
= ir
->location
;
432 for (unsigned int i
= 0; i
< array_elements
; i
++) {
433 for (unsigned int j
= 0; j
< type
->matrix_columns
; j
++) {
434 if (urb_setup
[location
] == -1) {
435 /* If there's no incoming setup data for this slot, don't
436 * emit interpolation for it.
438 attr
.reg_offset
+= type
->vector_elements
;
444 location
== FRAG_ATTRIB_COL0
|| location
== FRAG_ATTRIB_COL1
;
446 if (c
->key
.flat_shade
&& is_gl_Color
) {
447 /* Constant interpolation (flat shading) case. The SF has
448 * handed us defined values in only the constant offset
449 * field of the setup reg.
451 for (unsigned int k
= 0; k
< type
->vector_elements
; k
++) {
452 struct brw_reg interp
= interp_reg(location
, k
);
453 interp
= suboffset(interp
, 3);
454 emit(FS_OPCODE_CINTERP
, attr
, fs_reg(interp
));
458 /* Perspective interpolation case. */
459 for (unsigned int k
= 0; k
< type
->vector_elements
; k
++) {
460 struct brw_reg interp
= interp_reg(location
, k
);
461 emit(FS_OPCODE_LINTERP
, attr
,
462 this->delta_x
, this->delta_y
, fs_reg(interp
));
466 if (intel
->gen
< 6) {
467 attr
.reg_offset
-= type
->vector_elements
;
468 for (unsigned int k
= 0; k
< type
->vector_elements
; k
++) {
469 emit(BRW_OPCODE_MUL
, attr
, attr
, this->pixel_w
);
482 fs_visitor::emit_frontfacing_interpolation(ir_variable
*ir
)
484 fs_reg
*reg
= new(this->mem_ctx
) fs_reg(this, ir
->type
);
486 /* The frontfacing comes in as a bit in the thread payload. */
487 if (intel
->gen
>= 6) {
488 emit(BRW_OPCODE_ASR
, *reg
,
489 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D
)),
491 emit(BRW_OPCODE_NOT
, *reg
, *reg
);
492 emit(BRW_OPCODE_AND
, *reg
, *reg
, fs_reg(1));
494 struct brw_reg r1_6ud
= retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD
);
495 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
498 fs_inst
*inst
= emit(BRW_OPCODE_CMP
, *reg
,
501 inst
->conditional_mod
= BRW_CONDITIONAL_L
;
502 emit(BRW_OPCODE_AND
, *reg
, *reg
, fs_reg(1u));
509 fs_visitor::emit_math(fs_opcodes opcode
, fs_reg dst
, fs_reg src
)
521 assert(!"not reached: bad math opcode");
525 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
526 * might be able to do better by doing execsize = 1 math and then
527 * expanding that result out, but we would need to be careful with
530 * The hardware ignores source modifiers (negate and abs) on math
531 * instructions, so we also move to a temp to set those up.
533 if (intel
->gen
>= 6 && (src
.file
== UNIFORM
||
536 fs_reg expanded
= fs_reg(this, glsl_type::float_type
);
537 emit(BRW_OPCODE_MOV
, expanded
, src
);
541 fs_inst
*inst
= emit(opcode
, dst
, src
);
543 if (intel
->gen
< 6) {
545 inst
->mlen
= c
->dispatch_width
/ 8;
552 fs_visitor::emit_math(fs_opcodes opcode
, fs_reg dst
, fs_reg src0
, fs_reg src1
)
557 assert(opcode
== FS_OPCODE_POW
);
559 if (intel
->gen
>= 6) {
560 /* Can't do hstride == 0 args to gen6 math, so expand it out.
562 * The hardware ignores source modifiers (negate and abs) on math
563 * instructions, so we also move to a temp to set those up.
565 if (src0
.file
== UNIFORM
|| src0
.abs
|| src0
.negate
) {
566 fs_reg expanded
= fs_reg(this, glsl_type::float_type
);
567 emit(BRW_OPCODE_MOV
, expanded
, src0
);
571 if (src1
.file
== UNIFORM
|| src1
.abs
|| src1
.negate
) {
572 fs_reg expanded
= fs_reg(this, glsl_type::float_type
);
573 emit(BRW_OPCODE_MOV
, expanded
, src1
);
577 inst
= emit(opcode
, dst
, src0
, src1
);
579 emit(BRW_OPCODE_MOV
, fs_reg(MRF
, base_mrf
+ 1), src1
);
580 inst
= emit(opcode
, dst
, src0
, reg_null_f
);
582 inst
->base_mrf
= base_mrf
;
583 inst
->mlen
= 2 * c
->dispatch_width
/ 8;
589 * To be called after the last _mesa_add_state_reference() call, to
590 * set up prog_data.param[] for assign_curb_setup() and
591 * setup_pull_constants().
594 fs_visitor::setup_paramvalues_refs()
596 if (c
->dispatch_width
!= 8)
599 /* Set up the pointers to ParamValues now that that array is finalized. */
600 for (unsigned int i
= 0; i
< c
->prog_data
.nr_params
; i
++) {
601 c
->prog_data
.param
[i
] =
602 fp
->Base
.Parameters
->ParameterValues
[this->param_index
[i
]] +
603 this->param_offset
[i
];
608 fs_visitor::assign_curb_setup()
610 c
->prog_data
.curb_read_length
= ALIGN(c
->prog_data
.nr_params
, 8) / 8;
611 if (c
->dispatch_width
== 8) {
612 c
->prog_data
.first_curbe_grf
= c
->nr_payload_regs
;
614 c
->prog_data
.first_curbe_grf_16
= c
->nr_payload_regs
;
617 /* Map the offsets in the UNIFORM file to fixed HW regs. */
618 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
619 fs_inst
*inst
= (fs_inst
*)iter
.get();
621 for (unsigned int i
= 0; i
< 3; i
++) {
622 if (inst
->src
[i
].file
== UNIFORM
) {
623 int constant_nr
= inst
->src
[i
].hw_reg
+ inst
->src
[i
].reg_offset
;
624 struct brw_reg brw_reg
= brw_vec1_grf(c
->nr_payload_regs
+
628 inst
->src
[i
].file
= FIXED_HW_REG
;
629 inst
->src
[i
].fixed_hw_reg
= retype(brw_reg
, inst
->src
[i
].type
);
636 fs_visitor::calculate_urb_setup()
638 for (unsigned int i
= 0; i
< FRAG_ATTRIB_MAX
; i
++) {
643 /* Figure out where each of the incoming setup attributes lands. */
644 if (intel
->gen
>= 6) {
645 for (unsigned int i
= 0; i
< FRAG_ATTRIB_MAX
; i
++) {
646 if (brw
->fragment_program
->Base
.InputsRead
& BITFIELD64_BIT(i
)) {
647 urb_setup
[i
] = urb_next
++;
651 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
652 for (unsigned int i
= 0; i
< VERT_RESULT_MAX
; i
++) {
653 if (c
->key
.vp_outputs_written
& BITFIELD64_BIT(i
)) {
656 if (i
>= VERT_RESULT_VAR0
)
657 fp_index
= i
- (VERT_RESULT_VAR0
- FRAG_ATTRIB_VAR0
);
658 else if (i
<= VERT_RESULT_TEX7
)
664 urb_setup
[fp_index
] = urb_next
++;
669 /* Each attribute is 4 setup channels, each of which is half a reg. */
670 c
->prog_data
.urb_read_length
= urb_next
* 2;
674 fs_visitor::assign_urb_setup()
676 int urb_start
= c
->nr_payload_regs
+ c
->prog_data
.curb_read_length
;
678 /* Offset all the urb_setup[] index by the actual position of the
679 * setup regs, now that the location of the constants has been chosen.
681 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
682 fs_inst
*inst
= (fs_inst
*)iter
.get();
684 if (inst
->opcode
== FS_OPCODE_LINTERP
) {
685 assert(inst
->src
[2].file
== FIXED_HW_REG
);
686 inst
->src
[2].fixed_hw_reg
.nr
+= urb_start
;
689 if (inst
->opcode
== FS_OPCODE_CINTERP
) {
690 assert(inst
->src
[0].file
== FIXED_HW_REG
);
691 inst
->src
[0].fixed_hw_reg
.nr
+= urb_start
;
695 this->first_non_payload_grf
= urb_start
+ c
->prog_data
.urb_read_length
;
699 * Split large virtual GRFs into separate components if we can.
701 * This is mostly duplicated with what brw_fs_vector_splitting does,
702 * but that's really conservative because it's afraid of doing
703 * splitting that doesn't result in real progress after the rest of
704 * the optimization phases, which would cause infinite looping in
705 * optimization. We can do it once here, safely. This also has the
706 * opportunity to split interpolated values, or maybe even uniforms,
707 * which we don't have at the IR level.
709 * We want to split, because virtual GRFs are what we register
710 * allocate and spill (due to contiguousness requirements for some
711 * instructions), and they're what we naturally generate in the
712 * codegen process, but most virtual GRFs don't actually need to be
713 * contiguous sets of GRFs. If we split, we'll end up with reduced
714 * live intervals and better dead code elimination and coalescing.
717 fs_visitor::split_virtual_grfs()
719 int num_vars
= this->virtual_grf_next
;
720 bool split_grf
[num_vars
];
721 int new_virtual_grf
[num_vars
];
723 /* Try to split anything > 0 sized. */
724 for (int i
= 0; i
< num_vars
; i
++) {
725 if (this->virtual_grf_sizes
[i
] != 1)
728 split_grf
[i
] = false;
732 /* PLN opcodes rely on the delta_xy being contiguous. */
733 split_grf
[this->delta_x
.reg
] = false;
736 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
737 fs_inst
*inst
= (fs_inst
*)iter
.get();
739 /* Texturing produces 4 contiguous registers, so no splitting. */
740 if (inst
->is_tex()) {
741 split_grf
[inst
->dst
.reg
] = false;
745 /* Allocate new space for split regs. Note that the virtual
746 * numbers will be contiguous.
748 for (int i
= 0; i
< num_vars
; i
++) {
750 new_virtual_grf
[i
] = virtual_grf_alloc(1);
751 for (int j
= 2; j
< this->virtual_grf_sizes
[i
]; j
++) {
752 int reg
= virtual_grf_alloc(1);
753 assert(reg
== new_virtual_grf
[i
] + j
- 1);
756 this->virtual_grf_sizes
[i
] = 1;
760 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
761 fs_inst
*inst
= (fs_inst
*)iter
.get();
763 if (inst
->dst
.file
== GRF
&&
764 split_grf
[inst
->dst
.reg
] &&
765 inst
->dst
.reg_offset
!= 0) {
766 inst
->dst
.reg
= (new_virtual_grf
[inst
->dst
.reg
] +
767 inst
->dst
.reg_offset
- 1);
768 inst
->dst
.reg_offset
= 0;
770 for (int i
= 0; i
< 3; i
++) {
771 if (inst
->src
[i
].file
== GRF
&&
772 split_grf
[inst
->src
[i
].reg
] &&
773 inst
->src
[i
].reg_offset
!= 0) {
774 inst
->src
[i
].reg
= (new_virtual_grf
[inst
->src
[i
].reg
] +
775 inst
->src
[i
].reg_offset
- 1);
776 inst
->src
[i
].reg_offset
= 0;
780 this->live_intervals_valid
= false;
784 * Choose accesses from the UNIFORM file to demote to using the pull
787 * We allow a fragment shader to have more than the specified minimum
788 * maximum number of fragment shader uniform components (64). If
789 * there are too many of these, they'd fill up all of register space.
790 * So, this will push some of them out to the pull constant buffer and
791 * update the program to load them.
794 fs_visitor::setup_pull_constants()
796 /* Only allow 16 registers (128 uniform components) as push constants. */
797 unsigned int max_uniform_components
= 16 * 8;
798 if (c
->prog_data
.nr_params
<= max_uniform_components
)
801 if (c
->dispatch_width
== 16) {
802 fail("Pull constants not supported in 16-wide\n");
806 /* Just demote the end of the list. We could probably do better
807 * here, demoting things that are rarely used in the program first.
809 int pull_uniform_base
= max_uniform_components
;
810 int pull_uniform_count
= c
->prog_data
.nr_params
- pull_uniform_base
;
812 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
813 fs_inst
*inst
= (fs_inst
*)iter
.get();
815 for (int i
= 0; i
< 3; i
++) {
816 if (inst
->src
[i
].file
!= UNIFORM
)
819 int uniform_nr
= inst
->src
[i
].hw_reg
+ inst
->src
[i
].reg_offset
;
820 if (uniform_nr
< pull_uniform_base
)
823 fs_reg dst
= fs_reg(this, glsl_type::float_type
);
824 fs_inst
*pull
= new(mem_ctx
) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD
,
826 pull
->offset
= ((uniform_nr
- pull_uniform_base
) * 4) & ~15;
828 pull
->annotation
= inst
->annotation
;
832 inst
->insert_before(pull
);
834 inst
->src
[i
].file
= GRF
;
835 inst
->src
[i
].reg
= dst
.reg
;
836 inst
->src
[i
].reg_offset
= 0;
837 inst
->src
[i
].smear
= (uniform_nr
- pull_uniform_base
) & 3;
841 for (int i
= 0; i
< pull_uniform_count
; i
++) {
842 c
->prog_data
.pull_param
[i
] = c
->prog_data
.param
[pull_uniform_base
+ i
];
843 c
->prog_data
.pull_param_convert
[i
] =
844 c
->prog_data
.param_convert
[pull_uniform_base
+ i
];
846 c
->prog_data
.nr_params
-= pull_uniform_count
;
847 c
->prog_data
.nr_pull_params
= pull_uniform_count
;
851 fs_visitor::calculate_live_intervals()
853 int num_vars
= this->virtual_grf_next
;
854 int *def
= ralloc_array(mem_ctx
, int, num_vars
);
855 int *use
= ralloc_array(mem_ctx
, int, num_vars
);
859 if (this->live_intervals_valid
)
862 for (int i
= 0; i
< num_vars
; i
++) {
863 def
[i
] = MAX_INSTRUCTION
;
868 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
869 fs_inst
*inst
= (fs_inst
*)iter
.get();
871 if (inst
->opcode
== BRW_OPCODE_DO
) {
872 if (loop_depth
++ == 0)
874 } else if (inst
->opcode
== BRW_OPCODE_WHILE
) {
877 if (loop_depth
== 0) {
878 /* Patches up the use of vars marked for being live across
881 for (int i
= 0; i
< num_vars
; i
++) {
882 if (use
[i
] == loop_start
) {
888 for (unsigned int i
= 0; i
< 3; i
++) {
889 if (inst
->src
[i
].file
== GRF
&& inst
->src
[i
].reg
!= 0) {
890 int reg
= inst
->src
[i
].reg
;
895 def
[reg
] = MIN2(loop_start
, def
[reg
]);
896 use
[reg
] = loop_start
;
898 /* Nobody else is going to go smash our start to
899 * later in the loop now, because def[reg] now
900 * points before the bb header.
905 if (inst
->dst
.file
== GRF
&& inst
->dst
.reg
!= 0) {
906 int reg
= inst
->dst
.reg
;
909 def
[reg
] = MIN2(def
[reg
], ip
);
911 def
[reg
] = MIN2(def
[reg
], loop_start
);
919 ralloc_free(this->virtual_grf_def
);
920 ralloc_free(this->virtual_grf_use
);
921 this->virtual_grf_def
= def
;
922 this->virtual_grf_use
= use
;
924 this->live_intervals_valid
= true;
928 * Attempts to move immediate constants into the immediate
929 * constant slot of following instructions.
931 * Immediate constants are a bit tricky -- they have to be in the last
932 * operand slot, you can't do abs/negate on them,
936 fs_visitor::propagate_constants()
938 bool progress
= false;
940 calculate_live_intervals();
942 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
943 fs_inst
*inst
= (fs_inst
*)iter
.get();
945 if (inst
->opcode
!= BRW_OPCODE_MOV
||
947 inst
->dst
.file
!= GRF
|| inst
->src
[0].file
!= IMM
||
948 inst
->dst
.type
!= inst
->src
[0].type
||
949 (c
->dispatch_width
== 16 &&
950 (inst
->force_uncompressed
|| inst
->force_sechalf
)))
953 /* Don't bother with cases where we should have had the
954 * operation on the constant folded in GLSL already.
959 /* Found a move of a constant to a GRF. Find anything else using the GRF
960 * before it's written, and replace it with the constant if we can.
962 exec_list_iterator scan_iter
= iter
;
964 for (; scan_iter
.has_next(); scan_iter
.next()) {
965 fs_inst
*scan_inst
= (fs_inst
*)scan_iter
.get();
967 if (scan_inst
->opcode
== BRW_OPCODE_DO
||
968 scan_inst
->opcode
== BRW_OPCODE_WHILE
||
969 scan_inst
->opcode
== BRW_OPCODE_ELSE
||
970 scan_inst
->opcode
== BRW_OPCODE_ENDIF
) {
974 for (int i
= 2; i
>= 0; i
--) {
975 if (scan_inst
->src
[i
].file
!= GRF
||
976 scan_inst
->src
[i
].reg
!= inst
->dst
.reg
||
977 scan_inst
->src
[i
].reg_offset
!= inst
->dst
.reg_offset
)
980 /* Don't bother with cases where we should have had the
981 * operation on the constant folded in GLSL already.
983 if (scan_inst
->src
[i
].negate
|| scan_inst
->src
[i
].abs
)
986 switch (scan_inst
->opcode
) {
988 scan_inst
->src
[i
] = inst
->src
[0];
995 scan_inst
->src
[i
] = inst
->src
[0];
997 } else if (i
== 0 && scan_inst
->src
[1].file
!= IMM
) {
998 /* Fit this constant in by commuting the operands */
999 scan_inst
->src
[0] = scan_inst
->src
[1];
1000 scan_inst
->src
[1] = inst
->src
[0];
1005 case BRW_OPCODE_CMP
:
1007 scan_inst
->src
[i
] = inst
->src
[0];
1009 } else if (i
== 0 && scan_inst
->src
[1].file
!= IMM
) {
1012 new_cmod
= brw_swap_cmod(scan_inst
->conditional_mod
);
1013 if (new_cmod
!= ~0u) {
1014 /* Fit this constant in by swapping the operands and
1017 scan_inst
->src
[0] = scan_inst
->src
[1];
1018 scan_inst
->src
[1] = inst
->src
[0];
1019 scan_inst
->conditional_mod
= new_cmod
;
1025 case BRW_OPCODE_SEL
:
1027 scan_inst
->src
[i
] = inst
->src
[0];
1029 } else if (i
== 0 && scan_inst
->src
[1].file
!= IMM
) {
1030 /* Fit this constant in by swapping the operands and
1031 * flipping the predicate
1033 scan_inst
->src
[0] = scan_inst
->src
[1];
1034 scan_inst
->src
[1] = inst
->src
[0];
1035 scan_inst
->predicate_inverse
= !scan_inst
->predicate_inverse
;
1042 if (scan_inst
->dst
.file
== GRF
&&
1043 scan_inst
->dst
.reg
== inst
->dst
.reg
&&
1044 (scan_inst
->dst
.reg_offset
== inst
->dst
.reg_offset
||
1045 scan_inst
->is_tex())) {
1052 this->live_intervals_valid
= false;
1057 * Must be called after calculate_live_intervales() to remove unused
1058 * writes to registers -- register allocation will fail otherwise
1059 * because something deffed but not used won't be considered to
1060 * interfere with other regs.
1063 fs_visitor::dead_code_eliminate()
1065 bool progress
= false;
1068 calculate_live_intervals();
1070 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
1071 fs_inst
*inst
= (fs_inst
*)iter
.get();
1073 if (inst
->dst
.file
== GRF
&& this->virtual_grf_use
[inst
->dst
.reg
] <= pc
) {
1082 live_intervals_valid
= false;
1088 fs_visitor::register_coalesce()
1090 bool progress
= false;
1094 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
1095 fs_inst
*inst
= (fs_inst
*)iter
.get();
1097 /* Make sure that we dominate the instructions we're going to
1098 * scan for interfering with our coalescing, or we won't have
1099 * scanned enough to see if anything interferes with our
1100 * coalescing. We don't dominate the following instructions if
1101 * we're in a loop or an if block.
1103 switch (inst
->opcode
) {
1107 case BRW_OPCODE_WHILE
:
1113 case BRW_OPCODE_ENDIF
:
1117 if (loop_depth
|| if_depth
)
1120 if (inst
->opcode
!= BRW_OPCODE_MOV
||
1123 inst
->dst
.file
!= GRF
|| inst
->src
[0].file
!= GRF
||
1124 inst
->dst
.type
!= inst
->src
[0].type
)
1127 bool has_source_modifiers
= inst
->src
[0].abs
|| inst
->src
[0].negate
;
1129 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1130 * them: check for no writes to either one until the exit of the
1133 bool interfered
= false;
1134 exec_list_iterator scan_iter
= iter
;
1136 for (; scan_iter
.has_next(); scan_iter
.next()) {
1137 fs_inst
*scan_inst
= (fs_inst
*)scan_iter
.get();
1139 if (scan_inst
->dst
.file
== GRF
) {
1140 if (scan_inst
->dst
.reg
== inst
->dst
.reg
&&
1141 (scan_inst
->dst
.reg_offset
== inst
->dst
.reg_offset
||
1142 scan_inst
->is_tex())) {
1146 if (scan_inst
->dst
.reg
== inst
->src
[0].reg
&&
1147 (scan_inst
->dst
.reg_offset
== inst
->src
[0].reg_offset
||
1148 scan_inst
->is_tex())) {
1154 /* The gen6 MATH instruction can't handle source modifiers, so avoid
1155 * coalescing those for now. We should do something more specific.
1157 if (intel
->gen
>= 6 && scan_inst
->is_math() && has_source_modifiers
) {
1166 /* Rewrite the later usage to point at the source of the move to
1169 for (exec_list_iterator scan_iter
= iter
; scan_iter
.has_next();
1171 fs_inst
*scan_inst
= (fs_inst
*)scan_iter
.get();
1173 for (int i
= 0; i
< 3; i
++) {
1174 if (scan_inst
->src
[i
].file
== GRF
&&
1175 scan_inst
->src
[i
].reg
== inst
->dst
.reg
&&
1176 scan_inst
->src
[i
].reg_offset
== inst
->dst
.reg_offset
) {
1177 scan_inst
->src
[i
].reg
= inst
->src
[0].reg
;
1178 scan_inst
->src
[i
].reg_offset
= inst
->src
[0].reg_offset
;
1179 scan_inst
->src
[i
].abs
|= inst
->src
[0].abs
;
1180 scan_inst
->src
[i
].negate
^= inst
->src
[0].negate
;
1181 scan_inst
->src
[i
].smear
= inst
->src
[0].smear
;
1191 live_intervals_valid
= false;
1198 fs_visitor::compute_to_mrf()
1200 bool progress
= false;
1203 calculate_live_intervals();
1205 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
1206 fs_inst
*inst
= (fs_inst
*)iter
.get();
1211 if (inst
->opcode
!= BRW_OPCODE_MOV
||
1213 inst
->dst
.file
!= MRF
|| inst
->src
[0].file
!= GRF
||
1214 inst
->dst
.type
!= inst
->src
[0].type
||
1215 inst
->src
[0].abs
|| inst
->src
[0].negate
|| inst
->src
[0].smear
!= -1)
1218 /* Work out which hardware MRF registers are written by this
1221 int mrf_low
= inst
->dst
.hw_reg
& ~BRW_MRF_COMPR4
;
1223 if (inst
->dst
.hw_reg
& BRW_MRF_COMPR4
) {
1224 mrf_high
= mrf_low
+ 4;
1225 } else if (c
->dispatch_width
== 16 &&
1226 (!inst
->force_uncompressed
&& !inst
->force_sechalf
)) {
1227 mrf_high
= mrf_low
+ 1;
1232 /* Can't compute-to-MRF this GRF if someone else was going to
1235 if (this->virtual_grf_use
[inst
->src
[0].reg
] > ip
)
1238 /* Found a move of a GRF to a MRF. Let's see if we can go
1239 * rewrite the thing that made this GRF to write into the MRF.
1242 for (scan_inst
= (fs_inst
*)inst
->prev
;
1243 scan_inst
->prev
!= NULL
;
1244 scan_inst
= (fs_inst
*)scan_inst
->prev
) {
1245 if (scan_inst
->dst
.file
== GRF
&&
1246 scan_inst
->dst
.reg
== inst
->src
[0].reg
) {
1247 /* Found the last thing to write our reg we want to turn
1248 * into a compute-to-MRF.
1251 if (scan_inst
->is_tex()) {
1252 /* texturing writes several continuous regs, so we can't
1253 * compute-to-mrf that.
1258 /* If it's predicated, it (probably) didn't populate all
1259 * the channels. We might be able to rewrite everything
1260 * that writes that reg, but it would require smarter
1261 * tracking to delay the rewriting until complete success.
1263 if (scan_inst
->predicated
)
1266 /* If it's half of register setup and not the same half as
1267 * our MOV we're trying to remove, bail for now.
1269 if (scan_inst
->force_uncompressed
!= inst
->force_uncompressed
||
1270 scan_inst
->force_sechalf
!= inst
->force_sechalf
) {
1274 /* SEND instructions can't have MRF as a destination. */
1275 if (scan_inst
->mlen
)
1278 if (intel
->gen
>= 6) {
1279 /* gen6 math instructions must have the destination be
1280 * GRF, so no compute-to-MRF for them.
1282 if (scan_inst
->is_math()) {
1287 if (scan_inst
->dst
.reg_offset
== inst
->src
[0].reg_offset
) {
1288 /* Found the creator of our MRF's source value. */
1289 scan_inst
->dst
.file
= MRF
;
1290 scan_inst
->dst
.hw_reg
= inst
->dst
.hw_reg
;
1291 scan_inst
->saturate
|= inst
->saturate
;
1298 /* We don't handle flow control here. Most computation of
1299 * values that end up in MRFs are shortly before the MRF
1302 if (scan_inst
->opcode
== BRW_OPCODE_DO
||
1303 scan_inst
->opcode
== BRW_OPCODE_WHILE
||
1304 scan_inst
->opcode
== BRW_OPCODE_ELSE
||
1305 scan_inst
->opcode
== BRW_OPCODE_ENDIF
) {
1309 /* You can't read from an MRF, so if someone else reads our
1310 * MRF's source GRF that we wanted to rewrite, that stops us.
1312 bool interfered
= false;
1313 for (int i
= 0; i
< 3; i
++) {
1314 if (scan_inst
->src
[i
].file
== GRF
&&
1315 scan_inst
->src
[i
].reg
== inst
->src
[0].reg
&&
1316 scan_inst
->src
[i
].reg_offset
== inst
->src
[0].reg_offset
) {
1323 if (scan_inst
->dst
.file
== MRF
) {
1324 /* If somebody else writes our MRF here, we can't
1325 * compute-to-MRF before that.
1327 int scan_mrf_low
= scan_inst
->dst
.hw_reg
& ~BRW_MRF_COMPR4
;
1330 if (scan_inst
->dst
.hw_reg
& BRW_MRF_COMPR4
) {
1331 scan_mrf_high
= scan_mrf_low
+ 4;
1332 } else if (c
->dispatch_width
== 16 &&
1333 (!scan_inst
->force_uncompressed
&&
1334 !scan_inst
->force_sechalf
)) {
1335 scan_mrf_high
= scan_mrf_low
+ 1;
1337 scan_mrf_high
= scan_mrf_low
;
1340 if (mrf_low
== scan_mrf_low
||
1341 mrf_low
== scan_mrf_high
||
1342 mrf_high
== scan_mrf_low
||
1343 mrf_high
== scan_mrf_high
) {
1348 if (scan_inst
->mlen
> 0) {
1349 /* Found a SEND instruction, which means that there are
1350 * live values in MRFs from base_mrf to base_mrf +
1351 * scan_inst->mlen - 1. Don't go pushing our MRF write up
1354 if (mrf_low
>= scan_inst
->base_mrf
&&
1355 mrf_low
< scan_inst
->base_mrf
+ scan_inst
->mlen
) {
1358 if (mrf_high
>= scan_inst
->base_mrf
&&
1359 mrf_high
< scan_inst
->base_mrf
+ scan_inst
->mlen
) {
1370 * Walks through basic blocks, looking for repeated MRF writes and
1371 * removing the later ones.
/* Dead-MRF-write elimination: tracks the most recent eligible MOV into
 * each MRF slot and (in the portion elided from this extract) removes a
 * later MOV that writes the identical value to the same MRF.
 *
 * NOTE(review): the gaps in the embedded original line numbers (1375,
 * 1378, 1381-1382, 1384, 1387, 1389, 1391, 1395-1399, 1404-1409,
 * 1413-1414, 1418, 1421-1423, 1430-1433, 1439+) show this extract has
 * dropped statements (braces, returns, continues, the final result);
 * consult the full upstream file before changing any logic here.
 */
1374 fs_visitor::remove_duplicate_mrf_writes()
/* One tracking slot per MRF: last unpredicated GRF->MRF MOV seen, or NULL. */
1376 fs_inst
*last_mrf_move
[16];
1377 bool progress
= false;
1379 /* Need to update the MRF tracking for compressed instructions. */
/* Presumably the 16-wide (compressed) case bails out here -- the
 * statement under this if (original line 1381) is missing from the
 * extract; confirm against upstream. */
1380 if (c
->dispatch_width
== 16)
1383 memset(last_mrf_move
, 0, sizeof(last_mrf_move
));
/* Walk the instruction stream in program order. */
1385 foreach_iter(exec_list_iterator
, iter
, this->instructions
) {
1386 fs_inst
*inst
= (fs_inst
*)iter
.get();
/* At control-flow boundaries, conservatively forget every tracked MRF
 * move (the memset below); the remaining switch arms (original lines
 * 1389, 1391, 1395-1399) are absent from this extract. */
1388 switch (inst
->opcode
) {
1390 case BRW_OPCODE_WHILE
:
1392 case BRW_OPCODE_ELSE
:
1393 case BRW_OPCODE_ENDIF
:
1394 memset(last_mrf_move
, 0, sizeof(last_mrf_move
));
/* A MOV into an MRF that equals the previously recorded MOV into the
 * same MRF is redundant; the actual removal and progress update fall in
 * the missing 1404-1409 range. */
1400 if (inst
->opcode
== BRW_OPCODE_MOV
&&
1401 inst
->dst
.file
== MRF
) {
1402 fs_inst
*prev_inst
= last_mrf_move
[inst
->dst
.hw_reg
];
1403 if (prev_inst
&& inst
->equals(prev_inst
)) {
1410 /* Clear out the last-write records for MRFs that were overwritten. */
1411 if (inst
->dst
.file
== MRF
) {
1412 last_mrf_move
[inst
->dst
.hw_reg
] = NULL
;
/* mlen > 0 marks a SEND-style instruction with implied MRF writes,
 * so the records for its message registers must be invalidated. */
1415 if (inst
->mlen
> 0) {
1416 /* Found a SEND instruction, which will include two or fewer
1417 * implied MRF writes. We could do better here.
1419 for (int i
= 0; i
< implied_mrf_writes(inst
); i
++) {
1420 last_mrf_move
[inst
->base_mrf
+ i
] = NULL
;
1424 /* Clear out any MRF move records whose sources got overwritten. */
1425 if (inst
->dst
.file
== GRF
) {
1426 for (unsigned int i
= 0; i
< Elements(last_mrf_move
); i
++) {
1427 if (last_mrf_move
[i
] &&
1428 last_mrf_move
[i
]->src
[0].reg
== inst
->dst
.reg
) {
1429 last_mrf_move
[i
] = NULL
;
/* Record this MOV as the latest write to its MRF, but only when it is
 * an unpredicated GRF->MRF move -- presumably a predicated MOV cannot
 * be trusted to define every channel; TODO confirm against upstream. */
1434 if (inst
->opcode
== BRW_OPCODE_MOV
&&
1435 inst
->dst
.file
== MRF
&&
1436 inst
->src
[0].file
== GRF
&&
1437 !inst
->predicated
) {
1438 last_mrf_move
[inst
->dst
.hw_reg
] = inst
;
/* Live-interval interference test between two virtual GRFs: the overlap
 * window is [latest def of a/b, earliest last-use of a/b].
 *
 * NOTE(review): the tail of this function (original lines ~1483-1486,
 * including the normal return path and closing brace) is missing from
 * this extract; see the full upstream file.
 */
1446 fs_visitor::virtual_grf_interferes(int a
, int b
)
/* Intersection of the two registers' def/use ranges. */
1448 int start
= MAX2(this->virtual_grf_def
[a
], this->virtual_grf_def
[b
]);
1449 int end
= MIN2(this->virtual_grf_use
[a
], this->virtual_grf_use
[b
]);
1451 /* We can't handle dead register writes here, without iterating
1452 * over the whole instruction stream to find every single dead
1453 * write to that register to compare to the live interval of the
1454 * other register. Just assert that dead_code_eliminate() has been
/* Sanity check: each register is either used somewhere or was never
 * defined (def left at MAX_INSTRUCTION), i.e. DCE already ran. */
1457 assert((this->virtual_grf_use
[a
] != -1 ||
1458 this->virtual_grf_def
[a
] == MAX_INSTRUCTION
) &&
1459 (this->virtual_grf_use
[b
] != -1 ||
1460 this->virtual_grf_def
[b
] == MAX_INSTRUCTION
));
1462 /* If the register is used to store 16 values of less than float
1463 * size (only the case for pixel_[xy]), then we can't allocate
1464 * another dword-sized thing to that register that would be used in
1465 * the same instruction. This is because when the GPU decodes (for
1468 * (declare (in ) vec4 gl_FragCoord@0x97766a0)
1469 * add(16) g6<1>F g6<8,8,1>UW 0.5F { align1 compr };
1471 * it's actually processed as:
1472 * add(8) g6<1>F g6<8,8,1>UW 0.5F { align1 };
1473 * add(8) g7<1>F g6.8<8,8,1>UW 0.5F { align1 sechalf };
1475 * so our second half values in g6 got overwritten in the first
1478 if (c
->dispatch_width
== 16 && (this->pixel_x
.reg
== a
||
1479 this->pixel_x
.reg
== b
||
1480 this->pixel_y
.reg
== a
||
1481 this->pixel_y
.reg
== b
)) {
/* Inclusive comparison (<=) for the pixel_x/pixel_y special case:
 * overlap even at a single instruction counts as interference here. */
1482 return start
<= end
;
/* NOTE(review): fragment of what appears to be fs_visitor::run() -- the
 * function signature (original line ~1489) and many interior statements
 * (e.g. 1499-1501, 1506-1510, 1524-1528, 1533-1537, 1555-1559,
 * 1563-1567, 1570-1575) are missing from this extract. The comments
 * below describe only what the visible lines show; reconstruct from the
 * full upstream file before editing.
 */
1491 uint32_t prog_offset_16
= 0;
/* Remember the uniform count so the assert at the end can verify no
 * pass added uniforms behind our back. */
1492 uint32_t orig_nr_params
= c
->prog_data
.nr_params
;
1494 brw_wm_payload_setup(brw
, c
);
/* Second (16-wide) compile: pad the instruction store to a 64-byte
 * boundary and record where this program starts. */
1496 if (c
->dispatch_width
== 16) {
1497 /* align to 64 byte boundary. */
1498 while ((c
->func
.nr_insn
* sizeof(struct brw_instruction
)) % 64) {
1502 /* Save off the start of this 16-wide program in case we succeed. */
1503 prog_offset_16
= c
->func
.nr_insn
* sizeof(struct brw_instruction
);
1505 brw_set_compression_control(p
, BRW_COMPRESSION_COMPRESSED
);
/* Setup passes; the gen4 vs gen6 interpolation branch structure
 * (original lines 1512/1514) is elided in this extract. */
1511 calculate_urb_setup();
1513 emit_interpolation_setup_gen4();
1515 emit_interpolation_setup_gen6();
1517 /* Generate FS IR for main(). (the visitor only descends into
1518 * functions called "main").
1520 foreach_iter(exec_list_iterator
, iter
, *shader
->ir
) {
1521 ir_instruction
*ir
= (ir_instruction
*)iter
.get();
1523 this->result
= reg_undef
;
1529 split_virtual_grfs();
1531 setup_paramvalues_refs();
1532 setup_pull_constants();
/* Optimization loop: each pass reports whether it changed anything;
 * the surrounding do/while on `progress` is elided in this extract. */
1538 progress
= remove_duplicate_mrf_writes() || progress
;
1540 progress
= propagate_constants() || progress
;
1541 progress
= register_coalesce() || progress
;
1542 progress
= compute_to_mrf() || progress
;
1543 progress
= dead_code_eliminate() || progress
;
1546 schedule_instructions();
1548 assign_curb_setup();
1552 /* Debug of register spilling: Go spill everything. */
1553 int virtual_grf_count
= virtual_grf_next
;
1554 for (int i
= 1; i
< virtual_grf_count
; i
++) {
/* Register allocation: trivial path vs. retry-until-success path
 * (spilling presumably happens in the elided loop body). */
1560 assign_regs_trivial();
1562 while (!assign_regs()) {
/* Every push/pop of the compression-control stacks must be balanced
 * by the time code generation finishes. */
1568 assert(force_uncompressed_stack
== 0);
1569 assert(force_sechalf_stack
== 0);
/* Record per-dispatch-width register usage in prog_data; the 16-wide
 * branch also publishes the saved program offset. */
1576 if (c
->dispatch_width
== 8) {
1577 c
->prog_data
.reg_blocks
= brw_register_blocks(grf_used
);
1579 c
->prog_data
.reg_blocks_16
= brw_register_blocks(grf_used
);
1580 c
->prog_data
.prog_offset_16
= prog_offset_16
;
1582 /* Make sure we didn't try to sneak in an extra uniform */
1583 assert(orig_nr_params
== c
->prog_data
.nr_params
);
1590 brw_wm_fs_emit(struct brw_context
*brw
, struct brw_wm_compile
*c
)
1592 struct intel_context
*intel
= &brw
->intel
;
1593 struct gl_context
*ctx
= &intel
->ctx
;
1594 struct gl_shader_program
*prog
= ctx
->Shader
.CurrentFragmentProgram
;
1599 struct brw_shader
*shader
=
1600 (brw_shader
*) prog
->_LinkedShaders
[MESA_SHADER_FRAGMENT
];
1604 if (unlikely(INTEL_DEBUG
& DEBUG_WM
)) {
1605 printf("GLSL IR for native fragment shader %d:\n", prog
->Name
);
1606 _mesa_print_ir(shader
->ir
, NULL
);
1610 /* Now the main event: Visit the shader IR and generate our FS IR for it.
1612 c
->dispatch_width
= 8;
1614 fs_visitor
v(c
, shader
);
1616 /* FINISHME: Cleanly fail, test at link time, etc. */
1617 assert(!"not reached");
1621 if (intel
->gen
>= 5 && c
->prog_data
.nr_pull_params
== 0) {
1622 c
->dispatch_width
= 16;
1623 fs_visitor
v2(c
, shader
);
1624 v2
.import_uniforms(v
.variable_ht
);
1628 c
->prog_data
.dispatch_width
= 8;