2 * Copyright © 2014 Intel Corporation
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
24 * Jason Ekstrand (jason@jlekstrand.net)
29 #include "nir_builder.h"
/*
 * Implements a simple pass that lowers vecN instructions to a series of
 * moves with partial writes.
 */
/* Returns true iff the (non-SSA) destination and source refer to exactly the
 * same register storage: same nir_register, same base offset, and neither
 * access is indirect.  Used to detect moves from a register to itself.
 */
static bool
src_matches_dest_reg(nir_dest *dest, nir_src *src)
{
   /* An SSA value can never alias a register destination. */
   if (dest->is_ssa || src->is_ssa)
      return false;

   return (dest->reg.reg == src->reg.reg &&
           dest->reg.base_offset == src->reg.base_offset &&
           /* Indirectly-addressed accesses may or may not overlap, so we
            * conservatively treat them as not matching.
            */
           !dest->reg.indirect &&
           !src->reg.indirect);
}
/**
 * For a given starting writemask channel and corresponding source index in
 * the vec instruction, insert a MOV to the vec instruction's dest of all the
 * writemask channels that get read from the same src reg.
 *
 * Returns the writemask of our MOV, so the parent loop calling this knows
 * which ones have been processed.
 */
static unsigned
insert_mov(nir_alu_instr *vec, unsigned start_idx, nir_shader *shader)
{
   assert(start_idx < nir_op_infos[vec->op].num_inputs);

   nir_alu_instr *mov = nir_alu_instr_create(shader, nir_op_mov);
   nir_alu_src_copy(&mov->src[0], &vec->src[start_idx], mov);
   nir_alu_dest_copy(&mov->dest, &vec->dest, mov);

   mov->dest.write_mask = (1u << start_idx);
   /* Each vecN source is a scalar, so only swizzle component 0 of the vec
    * source is meaningful; it becomes the swizzle for channel start_idx.
    */
   mov->src[0].swizzle[start_idx] = vec->src[start_idx].swizzle[0];
   mov->src[0].negate = vec->src[start_idx].negate;
   mov->src[0].abs = vec->src[start_idx].abs;

   /* Fold any later channels that read the same source with identical
    * modifiers into this MOV by widening its writemask, so they all become
    * a single partial-write MOV.
    */
   for (unsigned i = start_idx + 1; i < 4; i++) {
      if (!(vec->dest.write_mask & (1 << i)))
         continue;

      if (nir_srcs_equal(vec->src[i].src, vec->src[start_idx].src) &&
          vec->src[i].negate == vec->src[start_idx].negate &&
          vec->src[i].abs == vec->src[start_idx].abs) {
         mov->dest.write_mask |= (1 << i);
         mov->src[0].swizzle[i] = vec->src[i].swizzle[0];
      }
   }

   unsigned channels_handled = mov->dest.write_mask;

   /* In some situations (if the vecN is involved in a phi-web), we can end
    * up with a mov from a register to itself.  Some of those channels may end
    * up doing nothing and there's no reason to have them as part of the mov.
    */
   if (src_matches_dest_reg(&mov->dest.dest, &mov->src[0].src) &&
       !mov->src[0].abs && !mov->src[0].negate) {
      for (unsigned i = 0; i < 4; i++) {
         /* Channel i copied from channel i of the same register is a no-op;
          * drop it from the writemask.
          */
         if (mov->src[0].swizzle[i] == i) {
            mov->dest.write_mask &= ~(1 << i);
         }
      }
   }

   /* Only emit the instruction if it actually does something */
   if (mov->dest.write_mask) {
      nir_instr_insert_before(&vec->instr, &mov->instr);
   }

   /* Report every channel we examined — including ones dropped above as
    * self-moves — so the caller does not try to process them again.
    */
   return channels_handled;
}
108 has_replicated_dest(nir_alu_instr
*alu
)
110 return alu
->op
== nir_op_fdot_replicated2
||
111 alu
->op
== nir_op_fdot_replicated3
||
112 alu
->op
== nir_op_fdot_replicated4
||
113 alu
->op
== nir_op_fdph_replicated
;
/* Attempts to coalesce the "move" from the given source of the vec to the
 * destination of the instruction generating the value.  If, for whatever
 * reason, we cannot coalesce the move, it does nothing and returns 0.  We
 * can then call insert_mov as normal.
 *
 * Returns the writemask of the channels successfully coalesced.
 */
static unsigned
try_coalesce(nir_alu_instr *vec, unsigned start_idx)
{
   assert(start_idx < nir_op_infos[vec->op].num_inputs);

   /* We will only even try if the source is SSA */
   if (!vec->src[start_idx].src.is_ssa)
      return 0;

   assert(vec->src[start_idx].src.ssa);

   /* If we are going to do a reswizzle, then the vecN operation must be the
    * only use of the source value.  We also can't have any source modifiers.
    */
   nir_foreach_use(src, vec->src[start_idx].src.ssa) {
      if (src->parent_instr != &vec->instr)
         return 0;

      nir_alu_src *alu_src = exec_node_data(nir_alu_src, src, src);
      if (alu_src->abs || alu_src->negate)
         return 0;
   }

   /* A use in an if-condition also counts as a use we can't coalesce away. */
   if (!list_is_empty(&vec->src[start_idx].src.ssa->if_uses))
      return 0;

   if (vec->src[start_idx].src.ssa->parent_instr->type != nir_instr_type_alu)
      return 0;

   nir_alu_instr *src_alu =
      nir_instr_as_alu(vec->src[start_idx].src.ssa->parent_instr);

   if (has_replicated_dest(src_alu)) {
      /* The fdot instruction is special: It replicates its result to all
       * components.  This means that we can always rewrite its destination
       * and we don't need to swizzle anything.
       */
   } else {
      /* We only care about being able to re-swizzle the instruction if it is
       * something that we can reswizzle.  It must be per-component.  The one
       * exception to this is the fdotN instructions which implicitly splat
       * their result out to all channels.
       */
      if (nir_op_infos[src_alu->op].output_size != 0)
         return 0;

      /* If we are going to reswizzle the instruction, we can't have any
       * non-per-component sources either.
       */
      for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
         if (nir_op_infos[src_alu->op].input_sizes[j] != 0)
            return 0;
   }

   /* Stash off all of the ALU instruction's swizzles. */
   uint8_t swizzles[4][4];
   for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
      for (unsigned i = 0; i < 4; i++)
         swizzles[j][i] = src_alu->src[j].swizzle[i];

   unsigned write_mask = 0;
   for (unsigned i = start_idx; i < 4; i++) {
      if (!(vec->dest.write_mask & (1 << i)))
         continue;

      /* Only channels that read this same SSA def can be coalesced. */
      if (!vec->src[i].src.is_ssa ||
          vec->src[i].src.ssa != &src_alu->dest.dest.ssa)
         continue;

      /* At this point, the given vec source matches up with the ALU
       * instruction so we can re-swizzle that component to match.
       */
      write_mask |= 1 << i;
      if (has_replicated_dest(src_alu)) {
         /* Since the destination is a single replicated value, we don't need
          * to do any reswizzling
          */
      } else {
         for (unsigned j = 0; j < nir_op_infos[src_alu->op].num_inputs; j++)
            src_alu->src[j].swizzle[i] = swizzles[j][vec->src[i].swizzle[0]];
      }

      /* Clear the no longer needed vec source */
      nir_instr_rewrite_src(&vec->instr, &vec->src[i].src, NIR_SRC_INIT);
   }

   /* Retarget the generating ALU instruction to write straight into the
    * vec's destination register with the coalesced writemask.
    */
   nir_instr_rewrite_dest(&src_alu->instr, &src_alu->dest.dest, vec->dest.dest);
   src_alu->dest.write_mask = write_mask;

   return write_mask;
}
/* Per-instruction callback: lowers a single vecN instruction to a series of
 * partial-write MOVs (or coalesces the writes into the source ALU
 * instructions).  Returns true if progress was made on this instruction.
 */
static bool
nir_lower_vec_to_movs_instr(nir_builder *b, nir_instr *instr, void *data)
{
   if (instr->type != nir_instr_type_alu)
      return false;

   nir_alu_instr *vec = nir_instr_as_alu(instr);

   /* NOTE(review): reconstructed — these lines were lost in extraction.
    * Only vecN opcodes are lowered here; every other ALU op is skipped.
    */
   switch (vec->op) {
   case nir_op_vec2:
   case nir_op_vec3:
   case nir_op_vec4:
      break;
   default:
      return false;
   }

   bool vec_had_ssa_dest = vec->dest.dest.is_ssa;
   if (vec->dest.dest.is_ssa) {
      /* Since we insert multiple MOVs, we have a register destination. */
      nir_register *reg = nir_local_reg_create(b->impl);
      reg->num_components = vec->dest.dest.ssa.num_components;
      reg->bit_size = vec->dest.dest.ssa.bit_size;

      nir_ssa_def_rewrite_uses(&vec->dest.dest.ssa, nir_src_for_reg(reg));

      nir_instr_rewrite_dest(&vec->instr, &vec->dest.dest,
                             nir_dest_for_reg(reg));
   }

   unsigned finished_write_mask = 0;

   /* First, emit a MOV for all the src channels that are in the
    * destination reg, in case other values we're populating in the dest
    * might overwrite them.
    */
   for (unsigned i = 0; i < 4; i++) {
      if (!(vec->dest.write_mask & (1 << i)))
         continue;

      if (src_matches_dest_reg(&vec->dest.dest, &vec->src[i].src)) {
         finished_write_mask |= insert_mov(vec, i, b->shader);
         /* NOTE(review): reconstructed — insert_mov already folds in every
          * later channel reading the same register, so one call suffices.
          */
         break;
      }
   }

   /* Now, emit MOVs for all the other src channels. */
   for (unsigned i = 0; i < 4; i++) {
      if (!(vec->dest.write_mask & (1 << i)))
         continue;

      /* Coalescing moves the register writes from the vec up to the ALU
       * instruction in the source.  We can only do this if the original
       * vecN had an SSA destination.
       */
      if (vec_had_ssa_dest && !(finished_write_mask & (1 << i)))
         finished_write_mask |= try_coalesce(vec, i);

      if (!(finished_write_mask & (1 << i)))
         finished_write_mask |= insert_mov(vec, i, b->shader);
   }

   /* All channels have been written by MOVs or coalesced writes; the vecN
    * itself is now dead.
    */
   nir_instr_remove(&vec->instr);

   return true;
}
282 nir_lower_vec_to_movs(nir_shader
*shader
)
284 return nir_shader_instructions_pass(shader
,
285 nir_lower_vec_to_movs_instr
,
286 nir_metadata_block_index
|
287 nir_metadata_dominance
,