r300/compiler: r500 hw support for break and continue in loops.
[mesa.git] / src / mesa / drivers / dri / r300 / compiler / radeon_optimize.c
1 /*
2 * Copyright (C) 2009 Nicolai Haehnle.
3 *
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28 #include "radeon_dataflow.h"
29
30 #include "radeon_compiler.h"
31 #include "radeon_swizzle.h"
32
33
34 static struct rc_src_register chain_srcregs(struct rc_src_register outer, struct rc_src_register inner)
35 {
36 struct rc_src_register combine;
37 combine.File = inner.File;
38 combine.Index = inner.Index;
39 combine.RelAddr = inner.RelAddr;
40 if (outer.Abs) {
41 combine.Abs = 1;
42 combine.Negate = outer.Negate;
43 } else {
44 combine.Abs = inner.Abs;
45 combine.Negate = 0;
46 for(unsigned int chan = 0; chan < 4; ++chan) {
47 unsigned int swz = GET_SWZ(outer.Swizzle, chan);
48 if (swz < 4)
49 combine.Negate |= GET_BIT(inner.Negate, swz) << chan;
50 }
51 combine.Negate ^= outer.Negate;
52 }
53 combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
54 return combine;
55 }
56
/**
 * Bookkeeping shared by peephole() and its read/write scan callbacks while
 * deciding whether one MOV instruction can be removed.
 */
struct peephole_state {
	/** Compiler the scanned program belongs to */
	struct radeon_compiler * C;

	/** The MOV instruction that is a candidate for removal */
	struct rc_instruction * Mov;

	/** Set as soon as some instruction makes removing Mov impossible */
	unsigned int Conflict:1;

	/** Set once a component that Mov reads has been overwritten */
	unsigned int SourceClobbered:1;

	/** Components of Mov's destination that still hold the value written by Mov */
	unsigned int MovMask:4;

	/** Components of Mov's destination that definitely do *not* come from Mov */
	unsigned int DefinedMask:4;

	/** Components of Mov's source register that Mov actually reads */
	unsigned int SourcedMask:4;

	/** Branch nesting relative to Mov; a negative value means Mov's block has been left */
	int BranchDepth;
};
77
78 /**
79 * This is a callback function that is meant to be passed to
80 * rc_for_all_reads_mask. This function will be called once for each source
81 * register in inst.
82 * @param inst The instruction that the source register belongs to.
83 * @param file The register file of the source register.
84 * @param index The index of the source register.
85 * @param mask The components of the source register that are being read from.
86 */
87 static void peephole_scan_read(void * data, struct rc_instruction * inst,
88 rc_register_file file, unsigned int index, unsigned int mask)
89 {
90 struct peephole_state * s = data;
91
92 if (file != RC_FILE_TEMPORARY || index != s->Mov->U.I.DstReg.Index)
93 return;
94
95 /* These instructions cannot read from the constants file.
96 * see radeonTransformTEX()
97 */
98 if(s->Mov->U.I.SrcReg[0].File != RC_FILE_TEMPORARY &&
99 s->Mov->U.I.SrcReg[0].File != RC_FILE_INPUT &&
100 (inst->U.I.Opcode == RC_OPCODE_TEX ||
101 inst->U.I.Opcode == RC_OPCODE_TXB ||
102 inst->U.I.Opcode == RC_OPCODE_TXP ||
103 inst->U.I.Opcode == RC_OPCODE_KIL)){
104 s->Conflict = 1;
105 return;
106 }
107 if ((mask & s->MovMask) == mask) {
108 if (s->SourceClobbered) {
109 s->Conflict = 1;
110 }
111 } else if ((mask & s->DefinedMask) == mask) {
112 /* read from something entirely written by other instruction: this is okay */
113 } else {
114 /* read from component combination that is not well-defined without
115 * the MOV: cannot remove it */
116 s->Conflict = 1;
117 }
118 }
119
120 static void peephole_scan_write(void * data, struct rc_instruction * inst,
121 rc_register_file file, unsigned int index, unsigned int mask)
122 {
123 struct peephole_state * s = data;
124
125 if (s->BranchDepth < 0)
126 return;
127
128 if (file == s->Mov->U.I.DstReg.File && index == s->Mov->U.I.DstReg.Index) {
129 s->MovMask &= ~mask;
130 if (s->BranchDepth == 0)
131 s->DefinedMask |= mask;
132 else
133 s->DefinedMask &= ~mask;
134 }
135 if (file == s->Mov->U.I.SrcReg[0].File && index == s->Mov->U.I.SrcReg[0].Index) {
136 if (mask & s->SourcedMask)
137 s->SourceClobbered = 1;
138 } else if (s->Mov->U.I.SrcReg[0].RelAddr && file == RC_FILE_ADDRESS) {
139 s->SourceClobbered = 1;
140 }
141 }
142
143 static void peephole(struct radeon_compiler * c, struct rc_instruction * inst_mov)
144 {
145 struct peephole_state s;
146
147 if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY || inst_mov->U.I.WriteALUResult)
148 return;
149
150 memset(&s, 0, sizeof(s));
151 s.C = c;
152 s.Mov = inst_mov;
153 s.MovMask = inst_mov->U.I.DstReg.WriteMask;
154 s.DefinedMask = RC_MASK_XYZW & ~s.MovMask;
155
156 for(unsigned int chan = 0; chan < 4; ++chan) {
157 unsigned int swz = GET_SWZ(inst_mov->U.I.SrcReg[0].Swizzle, chan);
158 s.SourcedMask |= (1 << swz) & RC_MASK_XYZW;
159 }
160
161 /* 1st pass: Check whether all subsequent readers can be changed */
162 for(struct rc_instruction * inst = inst_mov->Next;
163 inst != &c->Program.Instructions;
164 inst = inst->Next) {
165 /* XXX In the future we might be able to make the optimizer
166 * smart enough to handle loops. */
167 if(inst->U.I.Opcode == RC_OPCODE_BGNLOOP
168 || inst->U.I.Opcode == RC_OPCODE_ENDLOOP){
169 return;
170 }
171 rc_for_all_reads_mask(inst, peephole_scan_read, &s);
172 rc_for_all_writes_mask(inst, peephole_scan_write, &s);
173 if (s.Conflict)
174 return;
175
176 if (s.BranchDepth >= 0) {
177 if (inst->U.I.Opcode == RC_OPCODE_IF) {
178 s.BranchDepth++;
179 } else if (inst->U.I.Opcode == RC_OPCODE_ENDIF
180 || inst->U.I.Opcode == RC_OPCODE_ELSE) {
181 s.BranchDepth--;
182 if (s.BranchDepth < 0) {
183 s.DefinedMask &= ~s.MovMask;
184 s.MovMask = 0;
185 }
186 }
187 }
188 }
189
190 if (s.Conflict)
191 return;
192
193 /* 2nd pass: We can satisfy all readers, so switch them over all at once */
194 s.MovMask = inst_mov->U.I.DstReg.WriteMask;
195 s.BranchDepth = 0;
196
197 for(struct rc_instruction * inst = inst_mov->Next;
198 inst != &c->Program.Instructions;
199 inst = inst->Next) {
200 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
201
202 for(unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
203 if (inst->U.I.SrcReg[src].File == RC_FILE_TEMPORARY &&
204 inst->U.I.SrcReg[src].Index == s.Mov->U.I.DstReg.Index) {
205 unsigned int refmask = 0;
206
207 for(unsigned int chan = 0; chan < 4; ++chan) {
208 unsigned int swz = GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan);
209 refmask |= (1 << swz) & RC_MASK_XYZW;
210 }
211
212 if ((refmask & s.MovMask) == refmask)
213 inst->U.I.SrcReg[src] = chain_srcregs(inst->U.I.SrcReg[src], s.Mov->U.I.SrcReg[0]);
214 }
215 }
216
217 if (opcode->HasDstReg) {
218 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY &&
219 inst->U.I.DstReg.Index == s.Mov->U.I.DstReg.Index) {
220 s.MovMask &= ~inst->U.I.DstReg.WriteMask;
221 }
222 }
223
224 if (s.BranchDepth >= 0) {
225 if (inst->U.I.Opcode == RC_OPCODE_IF) {
226 s.BranchDepth++;
227 } else if (inst->U.I.Opcode == RC_OPCODE_ENDIF
228 || inst->U.I.Opcode == RC_OPCODE_ELSE) {
229 s.BranchDepth--;
230 if (s.BranchDepth < 0)
231 break; /* no more readers after this point */
232 }
233 }
234 }
235
236 /* Finally, remove the original MOV instruction */
237 rc_remove_instruction(inst_mov);
238 }
239
240 /**
241 * Check if a source register is actually always the same
242 * swizzle constant.
243 */
244 static int is_src_uniform_constant(struct rc_src_register src,
245 rc_swizzle * pswz, unsigned int * pnegate)
246 {
247 int have_used = 0;
248
249 if (src.File != RC_FILE_NONE) {
250 *pswz = 0;
251 return 0;
252 }
253
254 for(unsigned int chan = 0; chan < 4; ++chan) {
255 unsigned int swz = GET_SWZ(src.Swizzle, chan);
256 if (swz < 4) {
257 *pswz = 0;
258 return 0;
259 }
260 if (swz == RC_SWIZZLE_UNUSED)
261 continue;
262
263 if (!have_used) {
264 *pswz = swz;
265 *pnegate = GET_BIT(src.Negate, chan);
266 have_used = 1;
267 } else {
268 if (swz != *pswz || *pnegate != GET_BIT(src.Negate, chan)) {
269 *pswz = 0;
270 return 0;
271 }
272 }
273 }
274
275 return 1;
276 }
277
278
279 static void constant_folding_mad(struct rc_instruction * inst)
280 {
281 rc_swizzle swz;
282 unsigned int negate;
283
284 if (is_src_uniform_constant(inst->U.I.SrcReg[2], &swz, &negate)) {
285 if (swz == RC_SWIZZLE_ZERO) {
286 inst->U.I.Opcode = RC_OPCODE_MUL;
287 return;
288 }
289 }
290
291 if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
292 if (swz == RC_SWIZZLE_ONE) {
293 inst->U.I.Opcode = RC_OPCODE_ADD;
294 if (negate)
295 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
296 inst->U.I.SrcReg[1] = inst->U.I.SrcReg[2];
297 return;
298 } else if (swz == RC_SWIZZLE_ZERO) {
299 inst->U.I.Opcode = RC_OPCODE_MOV;
300 inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
301 return;
302 }
303 }
304
305 if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
306 if (swz == RC_SWIZZLE_ONE) {
307 inst->U.I.Opcode = RC_OPCODE_ADD;
308 if (negate)
309 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
310 inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
311 return;
312 } else if (swz == RC_SWIZZLE_ZERO) {
313 inst->U.I.Opcode = RC_OPCODE_MOV;
314 inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
315 return;
316 }
317 }
318 }
319
320 static void constant_folding_mul(struct rc_instruction * inst)
321 {
322 rc_swizzle swz;
323 unsigned int negate;
324
325 if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
326 if (swz == RC_SWIZZLE_ONE) {
327 inst->U.I.Opcode = RC_OPCODE_MOV;
328 inst->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
329 if (negate)
330 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
331 return;
332 } else if (swz == RC_SWIZZLE_ZERO) {
333 inst->U.I.Opcode = RC_OPCODE_MOV;
334 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
335 return;
336 }
337 }
338
339 if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
340 if (swz == RC_SWIZZLE_ONE) {
341 inst->U.I.Opcode = RC_OPCODE_MOV;
342 if (negate)
343 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
344 return;
345 } else if (swz == RC_SWIZZLE_ZERO) {
346 inst->U.I.Opcode = RC_OPCODE_MOV;
347 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
348 return;
349 }
350 }
351 }
352
353 static void constant_folding_add(struct rc_instruction * inst)
354 {
355 rc_swizzle swz;
356 unsigned int negate;
357
358 if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
359 if (swz == RC_SWIZZLE_ZERO) {
360 inst->U.I.Opcode = RC_OPCODE_MOV;
361 inst->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
362 return;
363 }
364 }
365
366 if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
367 if (swz == RC_SWIZZLE_ZERO) {
368 inst->U.I.Opcode = RC_OPCODE_MOV;
369 return;
370 }
371 }
372 }
373
374
375 /**
376 * Replace 0.0, 1.0 and 0.5 immediate constants by their
377 * respective swizzles. Simplify instructions like ADD dst, src, 0;
378 */
379 static void constant_folding(struct radeon_compiler * c, struct rc_instruction * inst)
380 {
381 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
382
383 /* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
384 for(unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
385 if (inst->U.I.SrcReg[src].File != RC_FILE_CONSTANT ||
386 inst->U.I.SrcReg[src].RelAddr ||
387 inst->U.I.SrcReg[src].Index >= c->Program.Constants.Count)
388 continue;
389
390 struct rc_constant * constant =
391 &c->Program.Constants.Constants[inst->U.I.SrcReg[src].Index];
392
393 if (constant->Type != RC_CONSTANT_IMMEDIATE)
394 continue;
395
396 struct rc_src_register newsrc = inst->U.I.SrcReg[src];
397 int have_real_reference = 0;
398 for(unsigned int chan = 0; chan < 4; ++chan) {
399 unsigned int swz = GET_SWZ(newsrc.Swizzle, chan);
400 if (swz >= 4)
401 continue;
402
403 unsigned int newswz;
404 float imm = constant->u.Immediate[swz];
405 float baseimm = imm;
406 if (imm < 0.0)
407 baseimm = -baseimm;
408
409 if (baseimm == 0.0) {
410 newswz = RC_SWIZZLE_ZERO;
411 } else if (baseimm == 1.0) {
412 newswz = RC_SWIZZLE_ONE;
413 } else if (baseimm == 0.5) {
414 newswz = RC_SWIZZLE_HALF;
415 } else {
416 have_real_reference = 1;
417 continue;
418 }
419
420 SET_SWZ(newsrc.Swizzle, chan, newswz);
421 if (imm < 0.0 && !newsrc.Abs)
422 newsrc.Negate ^= 1 << chan;
423 }
424
425 if (!have_real_reference) {
426 newsrc.File = RC_FILE_NONE;
427 newsrc.Index = 0;
428 }
429
430 /* don't make the swizzle worse */
431 if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, newsrc) &&
432 c->SwizzleCaps->IsNative(inst->U.I.Opcode, inst->U.I.SrcReg[src]))
433 continue;
434
435 inst->U.I.SrcReg[src] = newsrc;
436 }
437
438 /* Simplify instructions based on constants */
439 if (inst->U.I.Opcode == RC_OPCODE_MAD)
440 constant_folding_mad(inst);
441
442 /* note: MAD can simplify to MUL or ADD */
443 if (inst->U.I.Opcode == RC_OPCODE_MUL)
444 constant_folding_mul(inst);
445 else if (inst->U.I.Opcode == RC_OPCODE_ADD)
446 constant_folding_add(inst);
447 }
448
449 void rc_optimize(struct radeon_compiler * c)
450 {
451 struct rc_instruction * inst = c->Program.Instructions.Next;
452 while(inst != &c->Program.Instructions) {
453 struct rc_instruction * cur = inst;
454 inst = inst->Next;
455
456 constant_folding(c, cur);
457
458 if (cur->U.I.Opcode == RC_OPCODE_MOV) {
459 peephole(c, cur);
460 /* cur may no longer be part of the program */
461 }
462 }
463 }