2 * Copyright (C) 2008 Nicolai Haehnle.
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
31 * Shareable transformations that transform "special" ALU instructions
32 * into ALU instructions that are supported by hardware.
36 #include "radeon_program_alu.h"
38 #include "shader/prog_parameter.h"
41 static struct prog_instruction
*emit1(struct gl_program
* p
,
42 gl_inst_opcode Opcode
, struct prog_dst_register DstReg
,
43 struct prog_src_register SrcReg
)
45 struct prog_instruction
*fpi
= radeonAppendInstructions(p
, 1);
49 fpi
->SrcReg
[0] = SrcReg
;
53 static struct prog_instruction
*emit2(struct gl_program
* p
,
54 gl_inst_opcode Opcode
, struct prog_dst_register DstReg
,
55 struct prog_src_register SrcReg0
, struct prog_src_register SrcReg1
)
57 struct prog_instruction
*fpi
= radeonAppendInstructions(p
, 1);
61 fpi
->SrcReg
[0] = SrcReg0
;
62 fpi
->SrcReg
[1] = SrcReg1
;
66 static struct prog_instruction
*emit3(struct gl_program
* p
,
67 gl_inst_opcode Opcode
, struct prog_dst_register DstReg
,
68 struct prog_src_register SrcReg0
, struct prog_src_register SrcReg1
,
69 struct prog_src_register SrcReg2
)
71 struct prog_instruction
*fpi
= radeonAppendInstructions(p
, 1);
75 fpi
->SrcReg
[0] = SrcReg0
;
76 fpi
->SrcReg
[1] = SrcReg1
;
77 fpi
->SrcReg
[2] = SrcReg2
;
81 static void set_swizzle(struct prog_src_register
*SrcReg
, int coordinate
, int swz
)
83 SrcReg
->Swizzle
&= ~(7 << (3*coordinate
));
84 SrcReg
->Swizzle
|= swz
<< (3*coordinate
);
87 static void set_negate_base(struct prog_src_register
*SrcReg
, int coordinate
, int negate
)
89 SrcReg
->NegateBase
&= ~(1 << coordinate
);
90 SrcReg
->NegateBase
|= (negate
<< coordinate
);
/*
 * dstreg(file, index): build a prog_dst_register value.
 *
 * The statements visible here set WriteMask to WRITEMASK_XYZW, CondMask to
 * COND_TR and CondSwizzle to SWIZZLE_NOOP on a local `dst`.
 *
 * NOTE(review): this text was recovered from a numbered listing; the
 * leading numbers are listing line numbers, not code. Listing lines 94,
 * 96-97 and 101-104 are absent, so the braces, the stores that presumably
 * consume `file` and `index`, any remaining member initialisation and the
 * return statement are not visible here -- restore them from upstream
 * before compiling.
 */
93 static struct prog_dst_register
dstreg(int file
, int index
)
95 struct prog_dst_register dst
;
98 dst
.WriteMask
= WRITEMASK_XYZW
;
99 dst
.CondMask
= COND_TR
;
100 dst
.CondSwizzle
= SWIZZLE_NOOP
;
/*
 * dstregtmpmask(index, mask): build a prog_dst_register value whose File is
 * PROGRAM_TEMPORARY and whose WriteMask is `mask`.
 *
 * The statements visible here also set CondMask to COND_TR and CondSwizzle
 * to SWIZZLE_NOOP on a local `dst`.
 *
 * NOTE(review): this text was recovered from a numbered listing; the
 * leading numbers are listing line numbers, not code. Listing lines 107,
 * 110 and 114-117 are absent, so the braces, the store that presumably
 * consumes `index`, any remaining member initialisation and the return
 * statement are not visible here -- restore them from upstream before
 * compiling.
 */
106 static struct prog_dst_register
dstregtmpmask(int index
, int mask
)
108 struct prog_dst_register dst
;
109 dst
.File
= PROGRAM_TEMPORARY
;
111 dst
.WriteMask
= mask
;
112 dst
.CondMask
= COND_TR
;
113 dst
.CondSwizzle
= SWIZZLE_NOOP
;
119 static const struct prog_src_register builtin_zero
= {
120 .File
= PROGRAM_BUILTIN
,
122 .Swizzle
= SWIZZLE_0000
124 static const struct prog_src_register builtin_one
= {
125 .File
= PROGRAM_BUILTIN
,
127 .Swizzle
= SWIZZLE_1111
129 static const struct prog_src_register srcreg_undefined
= {
130 .File
= PROGRAM_UNDEFINED
,
132 .Swizzle
= SWIZZLE_NOOP
135 static struct prog_src_register
srcreg(int file
, int index
)
137 struct prog_src_register src
= srcreg_undefined
;
143 static struct prog_src_register
srcregswz(int file
, int index
, int swz
)
145 struct prog_src_register src
= srcreg_undefined
;
152 static struct prog_src_register
negate(struct prog_src_register reg
)
154 struct prog_src_register newreg
= reg
;
155 newreg
.NegateAbs
= !newreg
.NegateAbs
;
159 static struct prog_src_register
swizzle(struct prog_src_register reg
, GLuint x
, GLuint y
, GLuint z
, GLuint w
)
161 struct prog_src_register swizzled
= reg
;
162 swizzled
.Swizzle
= MAKE_SWIZZLE4(
163 x
>= 4 ? x
: GET_SWZ(reg
.Swizzle
, x
),
164 y
>= 4 ? y
: GET_SWZ(reg
.Swizzle
, y
),
165 z
>= 4 ? z
: GET_SWZ(reg
.Swizzle
, z
),
166 w
>= 4 ? w
: GET_SWZ(reg
.Swizzle
, w
));
170 static struct prog_src_register
scalar(struct prog_src_register reg
)
172 return swizzle(reg
, SWIZZLE_X
, SWIZZLE_X
, SWIZZLE_X
, SWIZZLE_X
);
/*
 * transform_ABS: replace an ABS instruction with a MOV.
 *
 * Source operand 0 of `inst` is copied into a local `src`, and a MOV of
 * that local into inst->DstReg is emitted into t->Program.
 *
 * NOTE(review): this text was recovered from a numbered listing; the
 * leading numbers are listing line numbers, not code. Listing lines
 * 179-181 are absent, so the statements that adjust `src` between the copy
 * and the MOV (presumably enabling its absolute-value modifier) are not
 * visible here, nor are the braces -- restore them from upstream before
 * compiling.
 */
175 static void transform_ABS(struct radeon_transform_context
* t
,
176 struct prog_instruction
* inst
)
178 struct prog_src_register src
= inst
->SrcReg
[0];
182 emit1(t
->Program
, OPCODE_MOV
, inst
->DstReg
, src
);
/*
 * transform_DPH: replace a DPH instruction with a DP4.
 *
 * Source operand 0 is copied into `src0`. Its W selector is forced to
 * SWIZZLE_ONE and its W NegateBase bit is cleared, then
 * DP4 dst, src0, SrcReg[1] is emitted.
 *
 * When src0.NegateAbs is set, two alternatives are visible: one copies
 * src0 into a fresh temporary with a MOV and rebuilds src0 with srcreg();
 * the other flips NegateBase for all components (NEGATE_XYZW).
 *
 * NOTE(review): this text was recovered from a numbered listing; the
 * leading numbers are listing line numbers, not code. Listing lines 190,
 * 194-195 and 197-198 are absent, so the condition that chooses between
 * the two alternatives and the closing braces are not visible here.
 *
 * NOTE(review): after the MOV into `tempreg`, src0 is rebuilt from
 * src0.File / src0.Index rather than from PROGRAM_TEMPORARY / tempreg, so
 * the temporary just written is never read and the original register is
 * used with its modifiers dropped. This looks unintended -- confirm
 * whether srcreg(PROGRAM_TEMPORARY, tempreg) was meant.
 */
185 static void transform_DPH(struct radeon_transform_context
* t
,
186 struct prog_instruction
* inst
)
188 struct prog_src_register src0
= inst
->SrcReg
[0];
189 if (src0
.NegateAbs
) {
191 int tempreg
= radeonFindFreeTemporary(t
);
192 emit1(t
->Program
, OPCODE_MOV
, dstreg(PROGRAM_TEMPORARY
, tempreg
), src0
);
193 src0
= srcreg(src0
.File
, src0
.Index
);
196 src0
.NegateBase
^= NEGATE_XYZW
;
199 set_swizzle(&src0
, 3, SWIZZLE_ONE
);
200 set_negate_base(&src0
, 3, 0);
201 emit2(t
->Program
, OPCODE_DP4
, inst
->DstReg
, src0
, inst
->SrcReg
[1]);
204 static void transform_FLR(struct radeon_transform_context
* t
,
205 struct prog_instruction
* inst
)
207 int tempreg
= radeonFindFreeTemporary(t
);
208 emit1(t
->Program
, OPCODE_FRC
, dstreg(PROGRAM_TEMPORARY
, tempreg
), inst
->SrcReg
[0]);
209 emit2(t
->Program
, OPCODE_ADD
, inst
->DstReg
, inst
->SrcReg
[0], negate(srcreg(PROGRAM_TEMPORARY
, tempreg
)));
/**
 * Definition of LIT (from ARB_fragment_program):
 *
 *  tmp = VectorLoad(op0);
 *  if (tmp.x < 0) tmp.x = 0;
 *  if (tmp.y < 0) tmp.y = 0;
 *  if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
 *  else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
 *  result.x = 1.0;
 *  result.y = tmp.x;
 *  result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
 *  result.w = 1.0;
 *
 * The longest path of computation is the one leading to result.z,
 * consisting of 5 operations. This implementation of LIT takes
 * 5 slots, if the subsequent optimization passes are clever enough
 * to pair instructions correctly.
 */
/*
 * transform_LIT: replace a LIT instruction with a sequence of simpler ALU
 * instructions that work in a temporary register `temp`.
 *
 * Visible steps:
 *  - LitConst (-127.999999) is registered as a one-component unnamed
 *    constant; `constant` / `constant_swizzle` receive its location.
 *  - The destination is tested for a write mask other than WRITEMASK_XYZW
 *    and for a register file other than PROGRAM_TEMPORARY.
 *  - `temp` is either a free temporary or inst->DstReg.Index; `srctemp`
 *    reads it back as a source operand.
 *  - MAX writes temp.xyw, with (0, 0, 0, constant) as one operand.
 *  - MIN writes temp.z from temp.w and the negated constant.
 *  - LG2, MUL, EX2 compute temp.w = EX2(LG2(temp.y) * temp.z).
 *  - CMP writes temp.z, using -temp.x as the condition and temp.w as the
 *    first alternative.
 *  - MOV writes temp.xyw = (1, temp.x, 1).
 *  - A final MOV copies srctemp into inst->DstReg.
 *
 * NOTE(review): this text was recovered from a numbered listing; the
 * leading numbers are listing line numbers, not code. Many listing lines
 * are absent (232, 234-235, 237, 244, 248-251, 253, 255, 263, 288, 295 and
 * the closing lines). Not visible here: the braces, the declarations of
 * `constant` and `temp`, the bodies of the two destination tests
 * (presumably setting needTemporary), the condition selecting between the
 * two `temp` assignments, one operand of the MAX, the last operand of the
 * CMP, and whatever guards the final MOV. Restore them from upstream
 * before compiling.
 */
230 static void transform_LIT(struct radeon_transform_context
* t
,
231 struct prog_instruction
* inst
)
233 static const GLfloat LitConst
[4] = { -127.999999 };
236 GLuint constant_swizzle
;
238 int needTemporary
= 0;
239 struct prog_src_register srctemp
;
241 constant
= _mesa_add_unnamed_constant(t
->Program
->Parameters
, LitConst
, 1, &constant_swizzle
);
243 if (inst
->DstReg
.WriteMask
!= WRITEMASK_XYZW
) {
245 } else if (inst
->DstReg
.File
!= PROGRAM_TEMPORARY
) {
246 // LIT is typically followed by DP3/DP4, so there's no point
247 // in creating special code for this case
252 temp
= radeonFindFreeTemporary(t
);
254 temp
= inst
->DstReg
.Index
;
256 srctemp
= srcreg(PROGRAM_TEMPORARY
, temp
);
258 // tmp.x = max(0.0, Src.x);
259 // tmp.y = max(0.0, Src.y);
260 // tmp.w = clamp(Src.z, -128+eps, 128-eps);
261 emit2(t
->Program
, OPCODE_MAX
,
262 dstregtmpmask(temp
, WRITEMASK_XYW
),
264 swizzle(srcreg(PROGRAM_CONSTANT
, constant
),
265 SWIZZLE_ZERO
, SWIZZLE_ZERO
, SWIZZLE_ZERO
, constant_swizzle
&3));
266 emit2(t
->Program
, OPCODE_MIN
,
267 dstregtmpmask(temp
, WRITEMASK_Z
),
268 swizzle(srctemp
, SWIZZLE_W
, SWIZZLE_W
, SWIZZLE_W
, SWIZZLE_W
),
269 negate(srcregswz(PROGRAM_CONSTANT
, constant
, constant_swizzle
)));
271 // tmp.w = Pow(tmp.y, tmp.w)
272 emit1(t
->Program
, OPCODE_LG2
,
273 dstregtmpmask(temp
, WRITEMASK_W
),
274 swizzle(srctemp
, SWIZZLE_Y
, SWIZZLE_Y
, SWIZZLE_Y
, SWIZZLE_Y
));
275 emit2(t
->Program
, OPCODE_MUL
,
276 dstregtmpmask(temp
, WRITEMASK_W
),
277 swizzle(srctemp
, SWIZZLE_W
, SWIZZLE_W
, SWIZZLE_W
, SWIZZLE_W
),
278 swizzle(srctemp
, SWIZZLE_Z
, SWIZZLE_Z
, SWIZZLE_Z
, SWIZZLE_Z
));
279 emit1(t
->Program
, OPCODE_EX2
,
280 dstregtmpmask(temp
, WRITEMASK_W
),
281 swizzle(srctemp
, SWIZZLE_W
, SWIZZLE_W
, SWIZZLE_W
, SWIZZLE_W
));
283 // tmp.z = (tmp.x > 0) ? tmp.w : 0.0
284 emit3(t
->Program
, OPCODE_CMP
,
285 dstregtmpmask(temp
, WRITEMASK_Z
),
286 negate(swizzle(srctemp
, SWIZZLE_X
, SWIZZLE_X
, SWIZZLE_X
, SWIZZLE_X
)),
287 swizzle(srctemp
, SWIZZLE_W
, SWIZZLE_W
, SWIZZLE_W
, SWIZZLE_W
),
290 // tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0
291 emit1(t
->Program
, OPCODE_MOV
,
292 dstregtmpmask(temp
, WRITEMASK_XYW
),
293 swizzle(srctemp
, SWIZZLE_ONE
, SWIZZLE_X
, SWIZZLE_ONE
, SWIZZLE_ONE
));
296 emit1(t
->Program
, OPCODE_MOV
, inst
->DstReg
, srctemp
);
299 static void transform_POW(struct radeon_transform_context
* t
,
300 struct prog_instruction
* inst
)
302 int tempreg
= radeonFindFreeTemporary(t
);
303 struct prog_dst_register tempdst
= dstreg(PROGRAM_TEMPORARY
, tempreg
);
304 struct prog_src_register tempsrc
= srcreg(PROGRAM_TEMPORARY
, tempreg
);
305 tempdst
.WriteMask
= WRITEMASK_W
;
306 tempsrc
.Swizzle
= SWIZZLE_WWWW
;
308 emit1(t
->Program
, OPCODE_LG2
, tempdst
, scalar(inst
->SrcReg
[0]));
309 emit2(t
->Program
, OPCODE_MUL
, tempdst
, tempsrc
, scalar(inst
->SrcReg
[1]));
310 emit1(t
->Program
, OPCODE_EX2
, inst
->DstReg
, tempsrc
);
313 static void transform_SGE(struct radeon_transform_context
* t
,
314 struct prog_instruction
* inst
)
316 int tempreg
= radeonFindFreeTemporary(t
);
318 emit2(t
->Program
, OPCODE_ADD
, dstreg(PROGRAM_TEMPORARY
, tempreg
), inst
->SrcReg
[0], negate(inst
->SrcReg
[1]));
319 emit3(t
->Program
, OPCODE_CMP
, inst
->DstReg
, srcreg(PROGRAM_TEMPORARY
, tempreg
), builtin_zero
, builtin_one
);
322 static void transform_SLT(struct radeon_transform_context
* t
,
323 struct prog_instruction
* inst
)
325 int tempreg
= radeonFindFreeTemporary(t
);
327 emit2(t
->Program
, OPCODE_ADD
, dstreg(PROGRAM_TEMPORARY
, tempreg
), inst
->SrcReg
[0], negate(inst
->SrcReg
[1]));
328 emit3(t
->Program
, OPCODE_CMP
, inst
->DstReg
, srcreg(PROGRAM_TEMPORARY
, tempreg
), builtin_one
, builtin_zero
);
331 static void transform_SUB(struct radeon_transform_context
* t
,
332 struct prog_instruction
* inst
)
334 emit2(t
->Program
, OPCODE_ADD
, inst
->DstReg
, inst
->SrcReg
[0], negate(inst
->SrcReg
[1]));
337 static void transform_SWZ(struct radeon_transform_context
* t
,
338 struct prog_instruction
* inst
)
340 emit1(t
->Program
, OPCODE_MOV
, inst
->DstReg
, inst
->SrcReg
[0]);
343 static void transform_XPD(struct radeon_transform_context
* t
,
344 struct prog_instruction
* inst
)
346 int tempreg
= radeonFindFreeTemporary(t
);
348 emit2(t
->Program
, OPCODE_MUL
, dstreg(PROGRAM_TEMPORARY
, tempreg
),
349 swizzle(inst
->SrcReg
[0], SWIZZLE_Z
, SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_W
),
350 swizzle(inst
->SrcReg
[1], SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_X
, SWIZZLE_W
));
351 emit3(t
->Program
, OPCODE_MAD
, inst
->DstReg
,
352 swizzle(inst
->SrcReg
[0], SWIZZLE_Y
, SWIZZLE_Z
, SWIZZLE_X
, SWIZZLE_W
),
353 swizzle(inst
->SrcReg
[1], SWIZZLE_Z
, SWIZZLE_X
, SWIZZLE_Y
, SWIZZLE_W
),
354 negate(srcreg(PROGRAM_TEMPORARY
, tempreg
)));
/**
 * Can be used as a transformation for @ref radeonClauseLocalTransform,
 * no userData necessary.
 *
 * Eliminates the following ALU instructions:
 *  ABS, DPH, FLR, LIT, POW, SGE, SLT, SUB, SWZ, XPD
 * using:
 *  MOV, ADD, MUL, MAD, FRC, DP4, MIN, MAX, LG2, EX2, CMP
 *
 * @note should be applicable to R300 and R500 fragment programs.
 */
369 GLboolean
radeonTransformALU(struct radeon_transform_context
* t
,
370 struct prog_instruction
* inst
,
373 switch(inst
->Opcode
) {
374 case OPCODE_ABS
: transform_ABS(t
, inst
); return GL_TRUE
;
375 case OPCODE_DPH
: transform_DPH(t
, inst
); return GL_TRUE
;
376 case OPCODE_FLR
: transform_FLR(t
, inst
); return GL_TRUE
;
377 case OPCODE_LIT
: transform_LIT(t
, inst
); return GL_TRUE
;
378 case OPCODE_POW
: transform_POW(t
, inst
); return GL_TRUE
;
379 case OPCODE_SGE
: transform_SGE(t
, inst
); return GL_TRUE
;
380 case OPCODE_SLT
: transform_SLT(t
, inst
); return GL_TRUE
;
381 case OPCODE_SUB
: transform_SUB(t
, inst
); return GL_TRUE
;
382 case OPCODE_SWZ
: transform_SWZ(t
, inst
); return GL_TRUE
;
383 case OPCODE_XPD
: transform_XPD(t
, inst
); return GL_TRUE
;