r300/compiler: fix handling of indexed temporaries in peephole
[mesa.git] / src / mesa / drivers / dri / r300 / compiler / radeon_optimize.c
1 /*
2 * Copyright (C) 2009 Nicolai Haehnle.
3 *
4 * All Rights Reserved.
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining
7 * a copy of this software and associated documentation files (the
8 * "Software"), to deal in the Software without restriction, including
9 * without limitation the rights to use, copy, modify, merge, publish,
10 * distribute, sublicense, and/or sell copies of the Software, and to
11 * permit persons to whom the Software is furnished to do so, subject to
12 * the following conditions:
13 *
14 * The above copyright notice and this permission notice (including the
15 * next paragraph) shall be included in all copies or substantial
16 * portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25 *
26 */
27
28 #include "radeon_dataflow.h"
29
30 #include "radeon_compiler.h"
31 #include "radeon_swizzle.h"
32
33
34 static struct rc_src_register chain_srcregs(struct rc_src_register outer, struct rc_src_register inner)
35 {
36 struct rc_src_register combine;
37 combine.File = inner.File;
38 combine.Index = inner.Index;
39 combine.RelAddr = inner.RelAddr;
40 if (outer.Abs) {
41 combine.Abs = 1;
42 combine.Negate = outer.Negate;
43 } else {
44 combine.Abs = inner.Abs;
45 combine.Negate = 0;
46 for(unsigned int chan = 0; chan < 4; ++chan) {
47 unsigned int swz = GET_SWZ(outer.Swizzle, chan);
48 if (swz < 4)
49 combine.Negate |= GET_BIT(inner.Negate, swz) << chan;
50 }
51 combine.Negate ^= outer.Negate;
52 }
53 combine.Swizzle = combine_swizzles(inner.Swizzle, outer.Swizzle);
54 return combine;
55 }
56
/**
 * Bookkeeping shared between peephole() and its register scan callbacks
 * while the instructions following one MOV are examined.
 */
struct peephole_state {
	/** Compiler the MOV belongs to */
	struct radeon_compiler * C;

	/** The MOV instruction that is a candidate for removal */
	struct rc_instruction * Mov;

	/** Set as soon as something is found that forbids removing the MOV */
	unsigned Conflict:1;

	/** Set once a register that Mov reads has been overwritten */
	unsigned SourceClobbered:1;

	/** Components of Mov's destination that still hold the value written by Mov */
	unsigned MovMask:4;

	/** Components of Mov's destination that definitely do *not* come from Mov */
	unsigned DefinedMask:4;

	/** Components of Mov's source register that Mov actually reads */
	unsigned SourcedMask:4;

	/** Branch nesting relative to Mov; a negative value means Mov's block was left */
	int BranchDepth;
};
77
78 /**
79 * This is a callback function that is meant to be passed to
80 * rc_for_all_reads_mask. This function will be called once for each source
81 * register in inst.
82 * @param inst The instruction that the source register belongs to.
83 * @param file The register file of the source register.
84 * @param index The index of the source register.
85 * @param mask The components of the source register that are being read from.
86 */
87 static void peephole_scan_read(void * data, struct rc_instruction * inst,
88 rc_register_file file, unsigned int index, unsigned int mask)
89 {
90 struct peephole_state * s = data;
91
92 /* XXX This could probably be handled better. */
93 if (file == RC_FILE_ADDRESS) {
94 s->Conflict = 1;
95 return;
96 }
97
98 if (file != RC_FILE_TEMPORARY || index != s->Mov->U.I.DstReg.Index)
99 return;
100
101 /* These instructions cannot read from the constants file.
102 * see radeonTransformTEX()
103 */
104 if(s->Mov->U.I.SrcReg[0].File != RC_FILE_TEMPORARY &&
105 s->Mov->U.I.SrcReg[0].File != RC_FILE_INPUT &&
106 (inst->U.I.Opcode == RC_OPCODE_TEX ||
107 inst->U.I.Opcode == RC_OPCODE_TXB ||
108 inst->U.I.Opcode == RC_OPCODE_TXP ||
109 inst->U.I.Opcode == RC_OPCODE_KIL)){
110 s->Conflict = 1;
111 return;
112 }
113 if ((mask & s->MovMask) == mask) {
114 if (s->SourceClobbered) {
115 s->Conflict = 1;
116 }
117 } else if ((mask & s->DefinedMask) == mask) {
118 /* read from something entirely written by other instruction: this is okay */
119 } else {
120 /* read from component combination that is not well-defined without
121 * the MOV: cannot remove it */
122 s->Conflict = 1;
123 }
124 }
125
126 static void peephole_scan_write(void * data, struct rc_instruction * inst,
127 rc_register_file file, unsigned int index, unsigned int mask)
128 {
129 struct peephole_state * s = data;
130
131 if (s->BranchDepth < 0)
132 return;
133
134 if (file == s->Mov->U.I.DstReg.File && index == s->Mov->U.I.DstReg.Index) {
135 s->MovMask &= ~mask;
136 if (s->BranchDepth == 0)
137 s->DefinedMask |= mask;
138 else
139 s->DefinedMask &= ~mask;
140 }
141 if (file == s->Mov->U.I.SrcReg[0].File && index == s->Mov->U.I.SrcReg[0].Index) {
142 if (mask & s->SourcedMask)
143 s->SourceClobbered = 1;
144 } else if (s->Mov->U.I.SrcReg[0].RelAddr && file == RC_FILE_ADDRESS) {
145 s->SourceClobbered = 1;
146 }
147 }
148
149 static void peephole(struct radeon_compiler * c, struct rc_instruction * inst_mov)
150 {
151 struct peephole_state s;
152
153 if (inst_mov->U.I.DstReg.File != RC_FILE_TEMPORARY ||
154 inst_mov->U.I.DstReg.RelAddr ||
155 inst_mov->U.I.WriteALUResult)
156 return;
157
158 memset(&s, 0, sizeof(s));
159 s.C = c;
160 s.Mov = inst_mov;
161 s.MovMask = inst_mov->U.I.DstReg.WriteMask;
162 s.DefinedMask = RC_MASK_XYZW & ~s.MovMask;
163
164 for(unsigned int chan = 0; chan < 4; ++chan) {
165 unsigned int swz = GET_SWZ(inst_mov->U.I.SrcReg[0].Swizzle, chan);
166 s.SourcedMask |= (1 << swz) & RC_MASK_XYZW;
167 }
168
169 /* 1st pass: Check whether all subsequent readers can be changed */
170 for(struct rc_instruction * inst = inst_mov->Next;
171 inst != &c->Program.Instructions;
172 inst = inst->Next) {
173 /* XXX In the future we might be able to make the optimizer
174 * smart enough to handle loops. */
175 if(inst->U.I.Opcode == RC_OPCODE_BGNLOOP
176 || inst->U.I.Opcode == RC_OPCODE_ENDLOOP){
177 return;
178 }
179 rc_for_all_reads_mask(inst, peephole_scan_read, &s);
180 rc_for_all_writes_mask(inst, peephole_scan_write, &s);
181 if (s.Conflict)
182 return;
183
184 if (s.BranchDepth >= 0) {
185 if (inst->U.I.Opcode == RC_OPCODE_IF) {
186 s.BranchDepth++;
187 } else if (inst->U.I.Opcode == RC_OPCODE_ENDIF
188 || inst->U.I.Opcode == RC_OPCODE_ELSE) {
189 s.BranchDepth--;
190 if (s.BranchDepth < 0) {
191 s.DefinedMask &= ~s.MovMask;
192 s.MovMask = 0;
193 }
194 }
195 }
196 }
197
198 if (s.Conflict)
199 return;
200
201 /* 2nd pass: We can satisfy all readers, so switch them over all at once */
202 s.MovMask = inst_mov->U.I.DstReg.WriteMask;
203 s.BranchDepth = 0;
204
205 for(struct rc_instruction * inst = inst_mov->Next;
206 inst != &c->Program.Instructions;
207 inst = inst->Next) {
208 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
209
210 for(unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
211 if (inst->U.I.SrcReg[src].File == RC_FILE_TEMPORARY &&
212 inst->U.I.SrcReg[src].Index == s.Mov->U.I.DstReg.Index) {
213 unsigned int refmask = 0;
214
215 for(unsigned int chan = 0; chan < 4; ++chan) {
216 unsigned int swz = GET_SWZ(inst->U.I.SrcReg[src].Swizzle, chan);
217 refmask |= (1 << swz) & RC_MASK_XYZW;
218 }
219
220 if ((refmask & s.MovMask) == refmask)
221 inst->U.I.SrcReg[src] = chain_srcregs(inst->U.I.SrcReg[src], s.Mov->U.I.SrcReg[0]);
222 }
223 }
224
225 if (opcode->HasDstReg) {
226 if (inst->U.I.DstReg.File == RC_FILE_TEMPORARY &&
227 inst->U.I.DstReg.Index == s.Mov->U.I.DstReg.Index) {
228 s.MovMask &= ~inst->U.I.DstReg.WriteMask;
229 }
230 }
231
232 if (s.BranchDepth >= 0) {
233 if (inst->U.I.Opcode == RC_OPCODE_IF) {
234 s.BranchDepth++;
235 } else if (inst->U.I.Opcode == RC_OPCODE_ENDIF
236 || inst->U.I.Opcode == RC_OPCODE_ELSE) {
237 s.BranchDepth--;
238 if (s.BranchDepth < 0)
239 break; /* no more readers after this point */
240 }
241 }
242 }
243
244 /* Finally, remove the original MOV instruction */
245 rc_remove_instruction(inst_mov);
246 }
247
248 /**
249 * Check if a source register is actually always the same
250 * swizzle constant.
251 */
252 static int is_src_uniform_constant(struct rc_src_register src,
253 rc_swizzle * pswz, unsigned int * pnegate)
254 {
255 int have_used = 0;
256
257 if (src.File != RC_FILE_NONE) {
258 *pswz = 0;
259 return 0;
260 }
261
262 for(unsigned int chan = 0; chan < 4; ++chan) {
263 unsigned int swz = GET_SWZ(src.Swizzle, chan);
264 if (swz < 4) {
265 *pswz = 0;
266 return 0;
267 }
268 if (swz == RC_SWIZZLE_UNUSED)
269 continue;
270
271 if (!have_used) {
272 *pswz = swz;
273 *pnegate = GET_BIT(src.Negate, chan);
274 have_used = 1;
275 } else {
276 if (swz != *pswz || *pnegate != GET_BIT(src.Negate, chan)) {
277 *pswz = 0;
278 return 0;
279 }
280 }
281 }
282
283 return 1;
284 }
285
286
287 static void constant_folding_mad(struct rc_instruction * inst)
288 {
289 rc_swizzle swz;
290 unsigned int negate;
291
292 if (is_src_uniform_constant(inst->U.I.SrcReg[2], &swz, &negate)) {
293 if (swz == RC_SWIZZLE_ZERO) {
294 inst->U.I.Opcode = RC_OPCODE_MUL;
295 return;
296 }
297 }
298
299 if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
300 if (swz == RC_SWIZZLE_ONE) {
301 inst->U.I.Opcode = RC_OPCODE_ADD;
302 if (negate)
303 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
304 inst->U.I.SrcReg[1] = inst->U.I.SrcReg[2];
305 return;
306 } else if (swz == RC_SWIZZLE_ZERO) {
307 inst->U.I.Opcode = RC_OPCODE_MOV;
308 inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
309 return;
310 }
311 }
312
313 if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
314 if (swz == RC_SWIZZLE_ONE) {
315 inst->U.I.Opcode = RC_OPCODE_ADD;
316 if (negate)
317 inst->U.I.SrcReg[1].Negate ^= RC_MASK_XYZW;
318 inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
319 return;
320 } else if (swz == RC_SWIZZLE_ZERO) {
321 inst->U.I.Opcode = RC_OPCODE_MOV;
322 inst->U.I.SrcReg[0] = inst->U.I.SrcReg[2];
323 return;
324 }
325 }
326 }
327
328 static void constant_folding_mul(struct rc_instruction * inst)
329 {
330 rc_swizzle swz;
331 unsigned int negate;
332
333 if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
334 if (swz == RC_SWIZZLE_ONE) {
335 inst->U.I.Opcode = RC_OPCODE_MOV;
336 inst->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
337 if (negate)
338 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
339 return;
340 } else if (swz == RC_SWIZZLE_ZERO) {
341 inst->U.I.Opcode = RC_OPCODE_MOV;
342 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
343 return;
344 }
345 }
346
347 if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
348 if (swz == RC_SWIZZLE_ONE) {
349 inst->U.I.Opcode = RC_OPCODE_MOV;
350 if (negate)
351 inst->U.I.SrcReg[0].Negate ^= RC_MASK_XYZW;
352 return;
353 } else if (swz == RC_SWIZZLE_ZERO) {
354 inst->U.I.Opcode = RC_OPCODE_MOV;
355 inst->U.I.SrcReg[0].Swizzle = RC_SWIZZLE_0000;
356 return;
357 }
358 }
359 }
360
361 static void constant_folding_add(struct rc_instruction * inst)
362 {
363 rc_swizzle swz;
364 unsigned int negate;
365
366 if (is_src_uniform_constant(inst->U.I.SrcReg[0], &swz, &negate)) {
367 if (swz == RC_SWIZZLE_ZERO) {
368 inst->U.I.Opcode = RC_OPCODE_MOV;
369 inst->U.I.SrcReg[0] = inst->U.I.SrcReg[1];
370 return;
371 }
372 }
373
374 if (is_src_uniform_constant(inst->U.I.SrcReg[1], &swz, &negate)) {
375 if (swz == RC_SWIZZLE_ZERO) {
376 inst->U.I.Opcode = RC_OPCODE_MOV;
377 return;
378 }
379 }
380 }
381
382
383 /**
384 * Replace 0.0, 1.0 and 0.5 immediate constants by their
385 * respective swizzles. Simplify instructions like ADD dst, src, 0;
386 */
387 static void constant_folding(struct radeon_compiler * c, struct rc_instruction * inst)
388 {
389 const struct rc_opcode_info * opcode = rc_get_opcode_info(inst->U.I.Opcode);
390
391 /* Replace 0.0, 1.0 and 0.5 immediates by their explicit swizzles */
392 for(unsigned int src = 0; src < opcode->NumSrcRegs; ++src) {
393 if (inst->U.I.SrcReg[src].File != RC_FILE_CONSTANT ||
394 inst->U.I.SrcReg[src].RelAddr ||
395 inst->U.I.SrcReg[src].Index >= c->Program.Constants.Count)
396 continue;
397
398 struct rc_constant * constant =
399 &c->Program.Constants.Constants[inst->U.I.SrcReg[src].Index];
400
401 if (constant->Type != RC_CONSTANT_IMMEDIATE)
402 continue;
403
404 struct rc_src_register newsrc = inst->U.I.SrcReg[src];
405 int have_real_reference = 0;
406 for(unsigned int chan = 0; chan < 4; ++chan) {
407 unsigned int swz = GET_SWZ(newsrc.Swizzle, chan);
408 if (swz >= 4)
409 continue;
410
411 unsigned int newswz;
412 float imm = constant->u.Immediate[swz];
413 float baseimm = imm;
414 if (imm < 0.0)
415 baseimm = -baseimm;
416
417 if (baseimm == 0.0) {
418 newswz = RC_SWIZZLE_ZERO;
419 } else if (baseimm == 1.0) {
420 newswz = RC_SWIZZLE_ONE;
421 } else if (baseimm == 0.5 && c->has_half_swizzles) {
422 newswz = RC_SWIZZLE_HALF;
423 } else {
424 have_real_reference = 1;
425 continue;
426 }
427
428 SET_SWZ(newsrc.Swizzle, chan, newswz);
429 if (imm < 0.0 && !newsrc.Abs)
430 newsrc.Negate ^= 1 << chan;
431 }
432
433 if (!have_real_reference) {
434 newsrc.File = RC_FILE_NONE;
435 newsrc.Index = 0;
436 }
437
438 /* don't make the swizzle worse */
439 if (!c->SwizzleCaps->IsNative(inst->U.I.Opcode, newsrc) &&
440 c->SwizzleCaps->IsNative(inst->U.I.Opcode, inst->U.I.SrcReg[src]))
441 continue;
442
443 inst->U.I.SrcReg[src] = newsrc;
444 }
445
446 /* Simplify instructions based on constants */
447 if (inst->U.I.Opcode == RC_OPCODE_MAD)
448 constant_folding_mad(inst);
449
450 /* note: MAD can simplify to MUL or ADD */
451 if (inst->U.I.Opcode == RC_OPCODE_MUL)
452 constant_folding_mul(inst);
453 else if (inst->U.I.Opcode == RC_OPCODE_ADD)
454 constant_folding_add(inst);
455 }
456
457 void rc_optimize(struct radeon_compiler * c, void *user)
458 {
459 struct rc_instruction * inst = c->Program.Instructions.Next;
460 while(inst != &c->Program.Instructions) {
461 struct rc_instruction * cur = inst;
462 inst = inst->Next;
463
464 constant_folding(c, cur);
465
466 if (cur->U.I.Opcode == RC_OPCODE_MOV) {
467 peephole(c, cur);
468 /* cur may no longer be part of the program */
469 }
470 }
471 }