2 * Copyright (C) 2009 Nicolai Haehnle.
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
28 #include "radeon_dataflow.h"
30 #include "radeon_compiler.h"
31 #include "radeon_swizzle.h"
/*
 * Build one source operand from two: `inner` supplies the register being
 * addressed (File, Index, RelAddr), while `outer` supplies the swizzle and
 * negate that are layered on top of it. peephole() calls this with a
 * reader's operand as `outer` and the MOV's source as `inner`.
 *
 * NOTE(review): this listing appears to be missing lines (the embedded
 * line numbers skip), so any branching around the Negate/Abs assignments
 * below, the closing braces and the return statement are not visible here.
 */
34 static struct rc_src_register
chain_srcregs(struct rc_src_register outer
, struct rc_src_register inner
)
36 struct rc_src_register combine
;
/* The combined operand addresses the same register as `inner`. */
37 combine
.File
= inner
.File
;
38 combine
.Index
= inner
.Index
;
39 combine
.RelAddr
= inner
.RelAddr
;
42 combine
.Negate
= outer
.Negate
;
44 combine
.Abs
= inner
.Abs
;
/*
 * For each output channel, look up which inner channel `outer` selects and
 * carry that inner channel's negate bit over to the output channel.
 */
46 for(unsigned int chan
= 0; chan
< 4; ++chan
) {
47 unsigned int swz
= GET_SWZ(outer
.Swizzle
, chan
);
49 combine
.Negate
|= GET_BIT(inner
.Negate
, swz
) << chan
;
/* Fold the outer negate on top of the bits gathered from inner. */
51 combine
.Negate
^= outer
.Negate
;
/* Compose the two swizzles via the combine_swizzles() helper. */
53 combine
.Swizzle
= combine_swizzles(inner
.Swizzle
, outer
.Swizzle
);
/*
 * Scratch state used by peephole() and handed to the peephole_scan_read /
 * peephole_scan_write callbacks through their `data` pointer.
 *
 * NOTE(review): the callbacks also access a BranchDepth member; its
 * declaration and the closing brace of this struct are not visible in this
 * listing (only the comment describing it survives at the end).
 */
57 struct peephole_state
{
58 struct radeon_compiler
* C
;
/* The MOV under examination; the callbacks consult its DstReg and SrcReg[0]. */
59 struct rc_instruction
* Mov
;
/* NOTE(review): no visible line in this listing reads or writes Conflict;
 * presumably it flags that the MOV cannot be removed - confirm. */
60 unsigned int Conflict
:1;
62 /** Whether Mov's source has been clobbered */
63 unsigned int SourceClobbered
:1;
65 /** Which components of Mov's destination register are still from that Mov? */
66 unsigned int MovMask
:4;
68 /** Which components of Mov's destination register are clearly *not* from that Mov */
69 unsigned int DefinedMask
:4;
71 /** Which components of Mov's source register are sourced */
72 unsigned int SourcedMask
:4;
74 /** Branch depth beyond Mov; negative value indicates we left the Mov's block */
79 * This is a callback function that is meant to be passed to
80 * rc_for_all_reads_mask. It will be called once for each source register read.
82 * @param inst The instruction that the source register belongs to.
83 * @param file The register file of the source register.
84 * @param index The index of the source register.
85 * @param mask The components of the source register that are being read from.
/*
 * Read callback that peephole() passes to rc_for_all_reads_mask. `data` is
 * the struct peephole_state of the MOV being examined; (file, index, mask)
 * describe one register read performed by `inst`.
 *
 * Visible checks, in order:
 *  - a test on whether the read targets a temporary with the same index as
 *    the MOV's destination;
 *  - a test combining "the MOV's source is neither a temporary nor an input"
 *    with "inst is TEX, TXB, TXP or KIL";
 *  - if every read component is still covered by MovMask, a test on
 *    SourceClobbered;
 *  - otherwise, if every read component is covered by DefinedMask, the read
 *    is acceptable (per the original comment);
 *  - otherwise the read mixes components in a way that is not well-defined
 *    without the MOV (per the original comment).
 *
 * NOTE(review): the statements guarded by these tests (presumably early
 * returns and writes to the state's Conflict flag) are not visible in this
 * listing - confirm against the complete file.
 */
87 static void peephole_scan_read(void * data
, struct rc_instruction
* inst
,
88 rc_register_file file
, unsigned int index
, unsigned int mask
)
90 struct peephole_state
* s
= data
;
/*
 * NOTE(review): the block comment opened a few lines below ("These
 * instructions cannot read ...") is never closed in this listing, so no
 * further annotations are inserted after this point to avoid changing how
 * the remaining text is lexed.
 */
92 if (file
!= RC_FILE_TEMPORARY
|| index
!= s
->Mov
->U
.I
.DstReg
.Index
)
95 /* These instructions cannot read from the constants file.
96 * see radeonTransformTEX()
98 if(s
->Mov
->U
.I
.SrcReg
[0].File
!= RC_FILE_TEMPORARY
&&
99 s
->Mov
->U
.I
.SrcReg
[0].File
!= RC_FILE_INPUT
&&
100 (inst
->U
.I
.Opcode
== RC_OPCODE_TEX
||
101 inst
->U
.I
.Opcode
== RC_OPCODE_TXB
||
102 inst
->U
.I
.Opcode
== RC_OPCODE_TXP
||
103 inst
->U
.I
.Opcode
== RC_OPCODE_KIL
)){
107 if ((mask
& s
->MovMask
) == mask
) {
108 if (s
->SourceClobbered
) {
111 } else if ((mask
& s
->DefinedMask
) == mask
) {
112 /* read from something entirely written by other instruction: this is okay */
114 /* read from component combination that is not well-defined without
115 * the MOV: cannot remove it */
/*
 * Write callback that peephole() passes to rc_for_all_writes_mask. `data`
 * is the struct peephole_state of the MOV being examined; (file, index,
 * mask) describe one register write performed by `inst`.
 *
 * It updates DefinedMask when the MOV's destination register is written and
 * sets SourceClobbered when the MOV's source can no longer be trusted.
 *
 * NOTE(review): several lines (braces, an `else`, the statement guarded by
 * the BranchDepth test) are not visible in this listing.
 */
120 static void peephole_scan_write(void * data
, struct rc_instruction
* inst
,
121 rc_register_file file
, unsigned int index
, unsigned int mask
)
123 struct peephole_state
* s
= data
;
/* Negative depth means we left the MOV's block (see struct comment);
 * the guarded statement is not visible, presumably an early return. */
125 if (s
->BranchDepth
< 0)
/* Case 1: the write hits the same register the MOV wrote. */
128 if (file
== s
->Mov
->U
.I
.DstReg
.File
&& index
== s
->Mov
->U
.I
.DstReg
.Index
) {
/* At branch depth 0 the written components become defined by this
 * instruction rather than by the MOV. */
130 if (s
->BranchDepth
== 0)
131 s
->DefinedMask
|= mask
;
/* Presumably the else-arm of the depth test (the `else` line is not
 * visible): the components are no longer considered clearly defined. */
133 s
->DefinedMask
&= ~mask
;
/* Case 2: the write hits the MOV's source register on a component that
 * the MOV actually reads. */
135 if (file
== s
->Mov
->U
.I
.SrcReg
[0].File
&& index
== s
->Mov
->U
.I
.SrcReg
[0].Index
) {
136 if (mask
& s
->SourcedMask
)
137 s
->SourceClobbered
= 1;
/* Case 3: the MOV's source is relatively addressed and the address
 * register file is written. */
138 } else if (s
->Mov
->U
.I
.SrcReg
[0].RelAddr
&& file
== RC_FILE_ADDRESS
) {
139 s
->SourceClobbered
= 1;
/*
 * Try to eliminate the MOV `inst_mov` by making later readers of its
 * destination read the MOV's source directly.
 *
 * Two passes over the instructions following the MOV are visible:
 *  1. scan reads and writes through peephole_scan_read / peephole_scan_write
 *     to decide whether all subsequent readers can be changed;
 *  2. rewrite matching source operands with chain_srcregs(), then remove
 *     the MOV with rc_remove_instruction().
 *
 * NOTE(review): this listing is missing lines (loop increments, early
 * returns, BranchDepth updates, the checks between the two passes and the
 * assignments that initialise s.C / s.Mov), so the exact exit conditions
 * cannot be read from here.
 */
143 static void peephole(struct radeon_compiler
* c
, struct rc_instruction
* inst_mov
)
145 struct peephole_state s
;
/* Precondition test: destination must be a temporary and the MOV must not
 * write the ALU result; the guarded statement is not visible. */
147 if (inst_mov
->U
.I
.DstReg
.File
!= RC_FILE_TEMPORARY
|| inst_mov
->U
.I
.WriteALUResult
)
150 memset(&s
, 0, sizeof(s
));
/* Components written by the MOV start out as "from the MOV"; all other
 * components start out as defined elsewhere. */
153 s
.MovMask
= inst_mov
->U
.I
.DstReg
.WriteMask
;
154 s
.DefinedMask
= RC_MASK_XYZW
& ~s
.MovMask
;
/* Collect which components of the source register the MOV's swizzle
 * selects; swizzle values outside X..W are masked away. */
156 for(unsigned int chan
= 0; chan
< 4; ++chan
) {
157 unsigned int swz
= GET_SWZ(inst_mov
->U
.I
.SrcReg
[0].Swizzle
, chan
);
158 s
.SourcedMask
|= (1 << swz
) & RC_MASK_XYZW
;
161 /* 1st pass: Check whether all subsequent readers can be changed */
162 for(struct rc_instruction
* inst
= inst_mov
->Next
;
163 inst
!= &c
->Program
.Instructions
;
165 /* XXX In the future we might be able to make the optimizer
166 * smart enough to handle loops. */
/* Loop boundaries are treated specially; the guarded statement is not
 * visible (presumably the optimisation is abandoned). */
167 if(inst
->U
.I
.Opcode
== RC_OPCODE_BGNLOOP
168 || inst
->U
.I
.Opcode
== RC_OPCODE_ENDLOOP
){
171 rc_for_all_reads_mask(inst
, peephole_scan_read
, &s
);
172 rc_for_all_writes_mask(inst
, peephole_scan_write
, &s
);
/* Branch tracking on IF / ELSE / ENDIF; the statements that adjust
 * BranchDepth are not visible in this listing. */
176 if (s
.BranchDepth
>= 0) {
177 if (inst
->U
.I
.Opcode
== RC_OPCODE_IF
) {
179 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_ENDIF
180 || inst
->U
.I
.Opcode
== RC_OPCODE_ELSE
) {
/* Once the MOV's block has been left, components still attributed to
 * the MOV are no longer counted as clearly defined. */
182 if (s
.BranchDepth
< 0) {
183 s
.DefinedMask
&= ~s
.MovMask
;
193 /* 2nd pass: We can satisfy all readers, so switch them over all at once */
/* Restart the tracking from the MOV's own write mask. */
194 s
.MovMask
= inst_mov
->U
.I
.DstReg
.WriteMask
;
197 for(struct rc_instruction
* inst
= inst_mov
->Next
;
198 inst
!= &c
->Program
.Instructions
;
200 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
/* Rewrite every source operand that reads the MOV's destination. */
202 for(unsigned int src
= 0; src
< opcode
->NumSrcRegs
; ++src
) {
203 if (inst
->U
.I
.SrcReg
[src
].File
== RC_FILE_TEMPORARY
&&
204 inst
->U
.I
.SrcReg
[src
].Index
== s
.Mov
->U
.I
.DstReg
.Index
) {
/* refmask = set of X..W components this operand's swizzle selects. */
205 unsigned int refmask
= 0;
207 for(unsigned int chan
= 0; chan
< 4; ++chan
) {
208 unsigned int swz
= GET_SWZ(inst
->U
.I
.SrcReg
[src
].Swizzle
, chan
);
209 refmask
|= (1 << swz
) & RC_MASK_XYZW
;
/* Only redirect the operand when everything it reads still comes
 * from the MOV. */
212 if ((refmask
& s
.MovMask
) == refmask
)
213 inst
->U
.I
.SrcReg
[src
] = chain_srcregs(inst
->U
.I
.SrcReg
[src
], s
.Mov
->U
.I
.SrcReg
[0]);
/* A later write to the same temporary takes those components away from
 * the MOV for the remaining instructions. */
217 if (opcode
->HasDstReg
) {
218 if (inst
->U
.I
.DstReg
.File
== RC_FILE_TEMPORARY
&&
219 inst
->U
.I
.DstReg
.Index
== s
.Mov
->U
.I
.DstReg
.Index
) {
220 s
.MovMask
&= ~inst
->U
.I
.DstReg
.WriteMask
;
/* Same branch tracking as in the first pass (depth updates not visible). */
224 if (s
.BranchDepth
>= 0) {
225 if (inst
->U
.I
.Opcode
== RC_OPCODE_IF
) {
227 } else if (inst
->U
.I
.Opcode
== RC_OPCODE_ENDIF
228 || inst
->U
.I
.Opcode
== RC_OPCODE_ELSE
) {
230 if (s
.BranchDepth
< 0)
231 break; /* no more readers after this point */
236 /* Finally, remove the original MOV instruction */
237 rc_remove_instruction(inst_mov
);
241 * Check if a source register is actually always the same swizzle constant.
/*
 * Test whether every used channel of `src` selects the same swizzle value
 * with the same negate bit. The common swizzle is reported through `pswz`
 * and the common negate bit through `pnegate`; the constant_folding_*
 * callers compare the reported swizzle against RC_SWIZZLE_ZERO / ONE.
 *
 * NOTE(review): the return statements, the handling of the first used
 * channel and the statements guarded by the tests below are not visible in
 * this listing.
 */
244 static int is_src_uniform_constant(struct rc_src_register src
,
245 rc_swizzle
* pswz
, unsigned int * pnegate
)
/* Test on the register file; the guarded statement is not visible. */
249 if (src
.File
!= RC_FILE_NONE
) {
254 for(unsigned int chan
= 0; chan
< 4; ++chan
) {
255 unsigned int swz
= GET_SWZ(src
.Swizzle
, chan
);
/* Unused channels are tested for separately (guarded statement not
 * visible, presumably they are skipped). */
260 if (swz
== RC_SWIZZLE_UNUSED
)
265 *pnegate
= GET_BIT(src
.Negate
, chan
);
/* A channel that disagrees with the recorded swizzle or negate bit
 * breaks uniformity. */
268 if (swz
!= *pswz
|| *pnegate
!= GET_BIT(src
.Negate
, chan
)) {
/*
 * Simplify a MAD instruction in place when one of its operands is a uniform
 * constant, as reported by is_src_uniform_constant():
 *  - SrcReg[2] is zero: the opcode becomes MUL;
 *  - SrcReg[1] or SrcReg[0] is one: the opcode becomes ADD;
 *  - SrcReg[1] or SrcReg[0] is zero: the opcode becomes MOV of SrcReg[2].
 *
 * NOTE(review): the declarations of `swz` and `negate`, the conditions
 * guarding the Negate flips (presumably `if (negate)`) and any early
 * returns are not visible in this listing.
 */
279 static void constant_folding_mad(struct rc_instruction
* inst
)
/* a*b + 0 */
284 if (is_src_uniform_constant(inst
->U
.I
.SrcReg
[2], &swz
, &negate
)) {
285 if (swz
== RC_SWIZZLE_ZERO
) {
286 inst
->U
.I
.Opcode
= RC_OPCODE_MUL
;
/* a*1 + c  or  a*0 + c */
291 if (is_src_uniform_constant(inst
->U
.I
.SrcReg
[1], &swz
, &negate
)) {
292 if (swz
== RC_SWIZZLE_ONE
) {
293 inst
->U
.I
.Opcode
= RC_OPCODE_ADD
;
295 inst
->U
.I
.SrcReg
[0].Negate
^= RC_MASK_XYZW
;
/* The addend moves into the second operand slot of the ADD. */
296 inst
->U
.I
.SrcReg
[1] = inst
->U
.I
.SrcReg
[2];
298 } else if (swz
== RC_SWIZZLE_ZERO
) {
299 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
300 inst
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[2];
/* 1*b + c  or  0*b + c */
305 if (is_src_uniform_constant(inst
->U
.I
.SrcReg
[0], &swz
, &negate
)) {
306 if (swz
== RC_SWIZZLE_ONE
) {
307 inst
->U
.I
.Opcode
= RC_OPCODE_ADD
;
309 inst
->U
.I
.SrcReg
[1].Negate
^= RC_MASK_XYZW
;
/* The addend replaces the constant-one operand. */
310 inst
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[2];
312 } else if (swz
== RC_SWIZZLE_ZERO
) {
313 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
314 inst
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[2];
/*
 * Simplify a MUL instruction in place when one operand is a uniform
 * constant, as reported by is_src_uniform_constant():
 *  - an operand equal to one: the opcode becomes MOV of the other operand;
 *  - an operand equal to zero: the opcode becomes MOV with SrcReg[0]'s
 *    swizzle set to RC_SWIZZLE_0000.
 *
 * NOTE(review): the declarations of `swz` and `negate`, the conditions
 * guarding the Negate flips (presumably `if (negate)`) and any early
 * returns are not visible in this listing.
 */
320 static void constant_folding_mul(struct rc_instruction
* inst
)
/* First operand is the constant. */
325 if (is_src_uniform_constant(inst
->U
.I
.SrcReg
[0], &swz
, &negate
)) {
326 if (swz
== RC_SWIZZLE_ONE
) {
327 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
/* The surviving operand moves into slot 0, which is what MOV reads. */
328 inst
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[1];
330 inst
->U
.I
.SrcReg
[0].Negate
^= RC_MASK_XYZW
;
332 } else if (swz
== RC_SWIZZLE_ZERO
) {
333 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
334 inst
->U
.I
.SrcReg
[0].Swizzle
= RC_SWIZZLE_0000
;
/* Second operand is the constant; slot 0 already holds the other one. */
339 if (is_src_uniform_constant(inst
->U
.I
.SrcReg
[1], &swz
, &negate
)) {
340 if (swz
== RC_SWIZZLE_ONE
) {
341 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
343 inst
->U
.I
.SrcReg
[0].Negate
^= RC_MASK_XYZW
;
345 } else if (swz
== RC_SWIZZLE_ZERO
) {
346 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
347 inst
->U
.I
.SrcReg
[0].Swizzle
= RC_SWIZZLE_0000
;
/*
 * Simplify an ADD instruction in place: when either operand is a uniform
 * zero (per is_src_uniform_constant()), the opcode becomes MOV of the other
 * operand.
 *
 * NOTE(review): the declarations of `swz` and `negate` and any early
 * returns are not visible in this listing.
 */
353 static void constant_folding_add(struct rc_instruction
* inst
)
/* 0 + b: the second operand moves into slot 0 for the MOV. */
358 if (is_src_uniform_constant(inst
->U
.I
.SrcReg
[0], &swz
, &negate
)) {
359 if (swz
== RC_SWIZZLE_ZERO
) {
360 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
361 inst
->U
.I
.SrcReg
[0] = inst
->U
.I
.SrcReg
[1];
/* a + 0: slot 0 already holds the surviving operand. */
366 if (is_src_uniform_constant(inst
->U
.I
.SrcReg
[1], &swz
, &negate
)) {
367 if (swz
== RC_SWIZZLE_ZERO
) {
368 inst
->U
.I
.Opcode
= RC_OPCODE_MOV
;
376 * Replace 0.0, 1.0 and 0.5 immediate constants by their
377 * respective swizzles. Simplify instructions like ADD dst, src, 0 into MOV.
/*
 * Per-instruction constant folding, in two steps:
 *  1. for each source operand that reads an immediate constant, replace
 *     channels whose value is 0.0, 1.0 or 0.5 with the matching constant
 *     swizzle, and drop the register reference if no channel still needs
 *     the real constant;
 *  2. hand MAD / MUL / ADD instructions to the opcode-specific helpers.
 *
 * NOTE(review): this listing is missing lines, including the declarations
 * of `baseimm` and `newswz`, the `continue` statements guarded by the
 * filtering tests and several braces, so the exact per-channel flow cannot
 * be read from here.
 */
379 static void constant_folding(struct radeon_compiler
* c
, struct rc_instruction
* inst
)
381 const struct rc_opcode_info
* opcode
= rc_get_opcode_info(inst
->U
.I
.Opcode
);
383 /* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
384 for(unsigned int src
= 0; src
< opcode
->NumSrcRegs
; ++src
) {
/* Filter: the operand must be a directly addressed constant whose index
 * lies inside the program's constant table. */
385 if (inst
->U
.I
.SrcReg
[src
].File
!= RC_FILE_CONSTANT
||
386 inst
->U
.I
.SrcReg
[src
].RelAddr
||
387 inst
->U
.I
.SrcReg
[src
].Index
>= c
->Program
.Constants
.Count
)
390 struct rc_constant
* constant
=
391 &c
->Program
.Constants
.Constants
[inst
->U
.I
.SrcReg
[src
].Index
];
/* Filter: only immediate constants are tested for here. */
393 if (constant
->Type
!= RC_CONSTANT_IMMEDIATE
)
/* Work on a copy so the original operand can be kept if the rewrite
 * turns out to be worse. */
396 struct rc_src_register newsrc
= inst
->U
.I
.SrcReg
[src
];
397 int have_real_reference
= 0;
398 for(unsigned int chan
= 0; chan
< 4; ++chan
) {
399 unsigned int swz
= GET_SWZ(newsrc
.Swizzle
, chan
);
404 float imm
= constant
->u
.Immediate
[swz
];
/* Map the special values onto constant swizzles. `baseimm` is declared
 * on a line not visible here (presumably derived from imm). */
409 if (baseimm
== 0.0) {
410 newswz
= RC_SWIZZLE_ZERO
;
411 } else if (baseimm
== 1.0) {
412 newswz
= RC_SWIZZLE_ONE
;
413 } else if (baseimm
== 0.5) {
414 newswz
= RC_SWIZZLE_HALF
;
/* Any other value still needs the actual constant register. */
416 have_real_reference
= 1;
420 SET_SWZ(newsrc
.Swizzle
, chan
, newswz
);
/* A negative immediate is expressed by flipping this channel's negate
 * bit, unless the operand takes the absolute value. */
421 if (imm
< 0.0 && !newsrc
.Abs
)
422 newsrc
.Negate
^= 1 << chan
;
/* No channel needs the constant any more: detach from the register. */
425 if (!have_real_reference
) {
426 newsrc
.File
= RC_FILE_NONE
;
430 /* don't make the swizzle worse */
431 if (!c
->SwizzleCaps
->IsNative(inst
->U
.I
.Opcode
, newsrc
) &&
432 c
->SwizzleCaps
->IsNative(inst
->U
.I
.Opcode
, inst
->U
.I
.SrcReg
[src
]))
435 inst
->U
.I
.SrcReg
[src
] = newsrc
;
438 /* Simplify instructions based on constants */
439 if (inst
->U
.I
.Opcode
== RC_OPCODE_MAD
)
440 constant_folding_mad(inst
);
442 /* note: MAD can simplify to MUL or ADD */
443 if (inst
->U
.I
.Opcode
== RC_OPCODE_MUL
)
444 constant_folding_mul(inst
);
445 else if (inst
->U
.I
.Opcode
== RC_OPCODE_ADD
)
446 constant_folding_add(inst
);
449 void rc_optimize(struct radeon_compiler
* c
)
451 struct rc_instruction
* inst
= c
->Program
.Instructions
.Next
;
452 while(inst
!= &c
->Program
.Instructions
) {
453 struct rc_instruction
* cur
= inst
;
456 constant_folding(c
, cur
);
458 if (cur
->U
.I
.Opcode
== RC_OPCODE_MOV
) {
460 /* cur may no longer be part of the program */