85ea810523f4381cea521fb309fed6a4027ee038
[mesa.git] / src / mesa / drivers / dri / r300 / radeon_program_alu.c
1 /*
2 * Copyright (C) 2008 Nicolai Haehnle.
3 *
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28 /**
29 * @file
30 *
31 * Shareable transformations that transform "special" ALU instructions
32 * into ALU instructions that are supported by hardware.
33 *
34 */
35
36 #include "radeon_program_alu.h"
37
38 #include "shader/prog_parameter.h"
39
40
41 static struct prog_instruction *emit1(struct gl_program* p,
42 gl_inst_opcode Opcode, struct prog_dst_register DstReg,
43 struct prog_src_register SrcReg)
44 {
45 struct prog_instruction *fpi = radeonAppendInstructions(p, 1);
46
47 fpi->Opcode = Opcode;
48 fpi->DstReg = DstReg;
49 fpi->SrcReg[0] = SrcReg;
50 return fpi;
51 }
52
53 static struct prog_instruction *emit2(struct gl_program* p,
54 gl_inst_opcode Opcode, struct prog_dst_register DstReg,
55 struct prog_src_register SrcReg0, struct prog_src_register SrcReg1)
56 {
57 struct prog_instruction *fpi = radeonAppendInstructions(p, 1);
58
59 fpi->Opcode = Opcode;
60 fpi->DstReg = DstReg;
61 fpi->SrcReg[0] = SrcReg0;
62 fpi->SrcReg[1] = SrcReg1;
63 return fpi;
64 }
65
66 static struct prog_instruction *emit3(struct gl_program* p,
67 gl_inst_opcode Opcode, struct prog_dst_register DstReg,
68 struct prog_src_register SrcReg0, struct prog_src_register SrcReg1,
69 struct prog_src_register SrcReg2)
70 {
71 struct prog_instruction *fpi = radeonAppendInstructions(p, 1);
72
73 fpi->Opcode = Opcode;
74 fpi->DstReg = DstReg;
75 fpi->SrcReg[0] = SrcReg0;
76 fpi->SrcReg[1] = SrcReg1;
77 fpi->SrcReg[2] = SrcReg2;
78 return fpi;
79 }
80
81 static void set_swizzle(struct prog_src_register *SrcReg, int coordinate, int swz)
82 {
83 SrcReg->Swizzle &= ~(7 << (3*coordinate));
84 SrcReg->Swizzle |= swz << (3*coordinate);
85 }
86
87 static void set_negate_base(struct prog_src_register *SrcReg, int coordinate, int negate)
88 {
89 SrcReg->NegateBase &= ~(1 << coordinate);
90 SrcReg->NegateBase |= (negate << coordinate);
91 }
92
93 static struct prog_dst_register dstreg(int file, int index)
94 {
95 struct prog_dst_register dst;
96 dst.File = file;
97 dst.Index = index;
98 dst.WriteMask = WRITEMASK_XYZW;
99 dst.CondMask = COND_TR;
100 dst.CondSwizzle = SWIZZLE_NOOP;
101 dst.CondSrc = 0;
102 dst.pad = 0;
103 return dst;
104 }
105
106 static struct prog_dst_register dstregtmpmask(int index, int mask)
107 {
108 struct prog_dst_register dst;
109 dst.File = PROGRAM_TEMPORARY;
110 dst.Index = index;
111 dst.WriteMask = mask;
112 dst.CondMask = COND_TR;
113 dst.CondSwizzle = SWIZZLE_NOOP;
114 dst.CondSrc = 0;
115 dst.pad = 0;
116 return dst;
117 }
118
119 static const struct prog_src_register builtin_zero = {
120 .File = PROGRAM_BUILTIN,
121 .Index = 0,
122 .Swizzle = SWIZZLE_0000
123 };
124 static const struct prog_src_register builtin_one = {
125 .File = PROGRAM_BUILTIN,
126 .Index = 0,
127 .Swizzle = SWIZZLE_1111
128 };
129 static const struct prog_src_register srcreg_undefined = {
130 .File = PROGRAM_UNDEFINED,
131 .Index = 0,
132 .Swizzle = SWIZZLE_NOOP
133 };
134
135 static struct prog_src_register srcreg(int file, int index)
136 {
137 struct prog_src_register src = srcreg_undefined;
138 src.File = file;
139 src.Index = index;
140 return src;
141 }
142
143 static struct prog_src_register srcregswz(int file, int index, int swz)
144 {
145 struct prog_src_register src = srcreg_undefined;
146 src.File = file;
147 src.Index = index;
148 src.Swizzle = swz;
149 return src;
150 }
151
152 static struct prog_src_register negate(struct prog_src_register reg)
153 {
154 struct prog_src_register newreg = reg;
155 newreg.NegateAbs = !newreg.NegateAbs;
156 return newreg;
157 }
158
159 static struct prog_src_register swizzle(struct prog_src_register reg, GLuint x, GLuint y, GLuint z, GLuint w)
160 {
161 struct prog_src_register swizzled = reg;
162 swizzled.Swizzle = MAKE_SWIZZLE4(
163 x >= 4 ? x : GET_SWZ(reg.Swizzle, x),
164 y >= 4 ? y : GET_SWZ(reg.Swizzle, y),
165 z >= 4 ? z : GET_SWZ(reg.Swizzle, z),
166 w >= 4 ? w : GET_SWZ(reg.Swizzle, w));
167 return swizzled;
168 }
169
170 static struct prog_src_register scalar(struct prog_src_register reg)
171 {
172 return swizzle(reg, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X);
173 }
174
175 static void transform_ABS(struct radeon_transform_context* t,
176 struct prog_instruction* inst)
177 {
178 struct prog_src_register src = inst->SrcReg[0];
179 src.Abs = 1;
180 src.NegateBase = 0;
181 src.NegateAbs = 0;
182 emit1(t->Program, OPCODE_MOV, inst->DstReg, src);
183 }
184
185 static void transform_DPH(struct radeon_transform_context* t,
186 struct prog_instruction* inst)
187 {
188 struct prog_src_register src0 = inst->SrcReg[0];
189 if (src0.NegateAbs) {
190 if (src0.Abs) {
191 int tempreg = radeonFindFreeTemporary(t);
192 emit1(t->Program, OPCODE_MOV, dstreg(PROGRAM_TEMPORARY, tempreg), src0);
193 src0 = srcreg(src0.File, src0.Index);
194 } else {
195 src0.NegateAbs = 0;
196 src0.NegateBase ^= NEGATE_XYZW;
197 }
198 }
199 set_swizzle(&src0, 3, SWIZZLE_ONE);
200 set_negate_base(&src0, 3, 0);
201 emit2(t->Program, OPCODE_DP4, inst->DstReg, src0, inst->SrcReg[1]);
202 }
203
204 static void transform_FLR(struct radeon_transform_context* t,
205 struct prog_instruction* inst)
206 {
207 int tempreg = radeonFindFreeTemporary(t);
208 emit1(t->Program, OPCODE_FRC, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0]);
209 emit2(t->Program, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(srcreg(PROGRAM_TEMPORARY, tempreg)));
210 }
211
212 /**
213 * Definition of LIT (from ARB_fragment_program):
214 *
215 * tmp = VectorLoad(op0);
216 * if (tmp.x < 0) tmp.x = 0;
217 * if (tmp.y < 0) tmp.y = 0;
218 * if (tmp.w < -(128.0-epsilon)) tmp.w = -(128.0-epsilon);
219 * else if (tmp.w > 128-epsilon) tmp.w = 128-epsilon;
220 * result.x = 1.0;
221 * result.y = tmp.x;
222 * result.z = (tmp.x > 0) ? RoughApproxPower(tmp.y, tmp.w) : 0.0;
223 * result.w = 1.0;
224 *
225 * The longest path of computation is the one leading to result.z,
226 * consisting of 5 operations. This implementation of LIT takes
227 * 5 slots, if the subsequent optimization passes are clever enough
228 * to pair instructions correctly.
229 */
230 static void transform_LIT(struct radeon_transform_context* t,
231 struct prog_instruction* inst)
232 {
233 static const GLfloat LitConst[4] = { -127.999999 };
234
235 GLuint constant;
236 GLuint constant_swizzle;
237 GLuint temp;
238 int needTemporary = 0;
239 struct prog_src_register srctemp;
240
241 constant = _mesa_add_unnamed_constant(t->Program->Parameters, LitConst, 1, &constant_swizzle);
242
243 if (inst->DstReg.WriteMask != WRITEMASK_XYZW) {
244 needTemporary = 1;
245 } else if (inst->DstReg.File != PROGRAM_TEMPORARY) {
246 // LIT is typically followed by DP3/DP4, so there's no point
247 // in creating special code for this case
248 needTemporary = 1;
249 }
250
251 if (needTemporary) {
252 temp = radeonFindFreeTemporary(t);
253 } else {
254 temp = inst->DstReg.Index;
255 }
256 srctemp = srcreg(PROGRAM_TEMPORARY, temp);
257
258 // tmp.x = max(0.0, Src.x);
259 // tmp.y = max(0.0, Src.y);
260 // tmp.w = clamp(Src.z, -128+eps, 128-eps);
261 emit2(t->Program, OPCODE_MAX,
262 dstregtmpmask(temp, WRITEMASK_XYW),
263 inst->SrcReg[0],
264 swizzle(srcreg(PROGRAM_CONSTANT, constant),
265 SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, constant_swizzle&3));
266 emit2(t->Program, OPCODE_MIN,
267 dstregtmpmask(temp, WRITEMASK_Z),
268 swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
269 negate(srcregswz(PROGRAM_CONSTANT, constant, constant_swizzle)));
270
271 // tmp.w = Pow(tmp.y, tmp.w)
272 emit1(t->Program, OPCODE_LG2,
273 dstregtmpmask(temp, WRITEMASK_W),
274 swizzle(srctemp, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y));
275 emit2(t->Program, OPCODE_MUL,
276 dstregtmpmask(temp, WRITEMASK_W),
277 swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
278 swizzle(srctemp, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z, SWIZZLE_Z));
279 emit1(t->Program, OPCODE_EX2,
280 dstregtmpmask(temp, WRITEMASK_W),
281 swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W));
282
283 // tmp.z = (tmp.x > 0) ? tmp.w : 0.0
284 emit3(t->Program, OPCODE_CMP,
285 dstregtmpmask(temp, WRITEMASK_Z),
286 negate(swizzle(srctemp, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X)),
287 swizzle(srctemp, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W),
288 builtin_zero);
289
290 // tmp.x, tmp.y, tmp.w = 1.0, tmp.x, 1.0
291 emit1(t->Program, OPCODE_MOV,
292 dstregtmpmask(temp, WRITEMASK_XYW),
293 swizzle(srctemp, SWIZZLE_ONE, SWIZZLE_X, SWIZZLE_ONE, SWIZZLE_ONE));
294
295 if (needTemporary)
296 emit1(t->Program, OPCODE_MOV, inst->DstReg, srctemp);
297 }
298
299 static void transform_POW(struct radeon_transform_context* t,
300 struct prog_instruction* inst)
301 {
302 int tempreg = radeonFindFreeTemporary(t);
303 struct prog_dst_register tempdst = dstreg(PROGRAM_TEMPORARY, tempreg);
304 struct prog_src_register tempsrc = srcreg(PROGRAM_TEMPORARY, tempreg);
305 tempdst.WriteMask = WRITEMASK_W;
306 tempsrc.Swizzle = SWIZZLE_WWWW;
307
308 emit1(t->Program, OPCODE_LG2, tempdst, scalar(inst->SrcReg[0]));
309 emit2(t->Program, OPCODE_MUL, tempdst, tempsrc, scalar(inst->SrcReg[1]));
310 emit1(t->Program, OPCODE_EX2, inst->DstReg, tempsrc);
311 }
312
313 static void transform_SGE(struct radeon_transform_context* t,
314 struct prog_instruction* inst)
315 {
316 int tempreg = radeonFindFreeTemporary(t);
317
318 emit2(t->Program, OPCODE_ADD, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1]));
319 emit3(t->Program, OPCODE_CMP, inst->DstReg, srcreg(PROGRAM_TEMPORARY, tempreg), builtin_zero, builtin_one);
320 }
321
322 static void transform_SLT(struct radeon_transform_context* t,
323 struct prog_instruction* inst)
324 {
325 int tempreg = radeonFindFreeTemporary(t);
326
327 emit2(t->Program, OPCODE_ADD, dstreg(PROGRAM_TEMPORARY, tempreg), inst->SrcReg[0], negate(inst->SrcReg[1]));
328 emit3(t->Program, OPCODE_CMP, inst->DstReg, srcreg(PROGRAM_TEMPORARY, tempreg), builtin_one, builtin_zero);
329 }
330
331 static void transform_SUB(struct radeon_transform_context* t,
332 struct prog_instruction* inst)
333 {
334 emit2(t->Program, OPCODE_ADD, inst->DstReg, inst->SrcReg[0], negate(inst->SrcReg[1]));
335 }
336
337 static void transform_SWZ(struct radeon_transform_context* t,
338 struct prog_instruction* inst)
339 {
340 emit1(t->Program, OPCODE_MOV, inst->DstReg, inst->SrcReg[0]);
341 }
342
343 static void transform_XPD(struct radeon_transform_context* t,
344 struct prog_instruction* inst)
345 {
346 int tempreg = radeonFindFreeTemporary(t);
347
348 emit2(t->Program, OPCODE_MUL, dstreg(PROGRAM_TEMPORARY, tempreg),
349 swizzle(inst->SrcReg[0], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W),
350 swizzle(inst->SrcReg[1], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W));
351 emit3(t->Program, OPCODE_MAD, inst->DstReg,
352 swizzle(inst->SrcReg[0], SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W),
353 swizzle(inst->SrcReg[1], SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W),
354 negate(srcreg(PROGRAM_TEMPORARY, tempreg)));
355 }
356
357
358 /**
359 * Can be used as a transformation for @ref radeonClauseLocalTransform,
360 * no userData necessary.
361 *
362 * Eliminates the following ALU instructions:
363 * ABS, DPH, FLR, LIT, POW, SGE, SLT, SUB, SWZ, XPD
364 * using:
365 * MOV, ADD, MUL, MAD, FRC, DP3, LG2, EX2, CMP
366 *
367 * @note should be applicable to R300 and R500 fragment programs.
368 */
369 GLboolean radeonTransformALU(struct radeon_transform_context* t,
370 struct prog_instruction* inst,
371 void* unused)
372 {
373 switch(inst->Opcode) {
374 case OPCODE_ABS: transform_ABS(t, inst); return GL_TRUE;
375 case OPCODE_DPH: transform_DPH(t, inst); return GL_TRUE;
376 case OPCODE_FLR: transform_FLR(t, inst); return GL_TRUE;
377 case OPCODE_LIT: transform_LIT(t, inst); return GL_TRUE;
378 case OPCODE_POW: transform_POW(t, inst); return GL_TRUE;
379 case OPCODE_SGE: transform_SGE(t, inst); return GL_TRUE;
380 case OPCODE_SLT: transform_SLT(t, inst); return GL_TRUE;
381 case OPCODE_SUB: transform_SUB(t, inst); return GL_TRUE;
382 case OPCODE_SWZ: transform_SWZ(t, inst); return GL_TRUE;
383 case OPCODE_XPD: transform_XPD(t, inst); return GL_TRUE;
384 default:
385 return GL_FALSE;
386 }
387 }