r600: use min_dx10/max_dx10 instead of min/max
[mesa.git] / src / gallium / drivers / r600 / sb / sb_expr.cpp
1 /*
2 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Vadim Girlin
25 */
26
27 #include <cmath>
28
29 #include "sb_shader.h"
30
31 namespace r600_sb {
32
33 value* get_select_value_for_em(shader& sh, value* em) {
34 if (!em->def)
35 return NULL;
36
37 node *predset = em->def;
38 if (!predset->is_pred_set())
39 return NULL;
40
41 alu_node *s = sh.clone(static_cast<alu_node*>(predset));
42 convert_predset_to_set(sh, s);
43
44 predset->insert_after(s);
45
46 value* &d0 = s->dst[0];
47 d0 = sh.create_temp_value();
48 d0->def = s;
49 return d0;
50 }
51
52 void convert_to_mov(alu_node &n, value *src, bool neg, bool abs) {
53 n.src.resize(1);
54 n.src[0] = src;
55 n.bc.src[0].abs = abs;
56 n.bc.src[0].neg = neg;
57 n.bc.set_op(ALU_OP1_MOV);
58 }
59
60 expr_handler::expr_handler(shader& sh) : sh(sh), vt(sh.vt) {}
61
62 value * expr_handler::get_const(const literal &l) {
63 value *v = sh.get_const_value(l);
64 if (!v->gvn_source)
65 vt.add_value(v);
66 return v;
67 }
68
69 void expr_handler::assign_source(value *dst, value *src) {
70 dst->gvn_source = src->gvn_source;
71 }
72
73 bool expr_handler::equal(value *l, value *r) {
74
75 assert(l != r);
76
77 if (l->gvalue() == r->gvalue())
78 return true;
79
80 if (l->def && r->def)
81 return defs_equal(l, r);
82
83 if (l->is_rel() && r->is_rel())
84 return ivars_equal(l, r);
85
86 return false;
87 }
88
89 bool expr_handler::ivars_equal(value* l, value* r) {
90 if (l->rel->gvalue() == r->rel->gvalue()
91 && l->select == r->select) {
92
93 vvec &lv = l->mdef.empty() ? l->muse : l->mdef;
94 vvec &rv = r->mdef.empty() ? r->muse : r->mdef;
95
96 // FIXME: replace this with more precise aliasing test
97 return lv == rv;
98 }
99 return false;
100 }
101
102 bool expr_handler::defs_equal(value* l, value* r) {
103
104 node *d1 = l->def;
105 node *d2 = r->def;
106
107 if (d1->type != d2->type || d1->subtype != d2->subtype)
108 return false;
109
110 if (d1->is_pred_set() || d2->is_pred_set())
111 return false;
112
113 if (d1->type == NT_OP) {
114 switch (d1->subtype) {
115 case NST_ALU_INST:
116 return ops_equal(
117 static_cast<alu_node*>(d1),
118 static_cast<alu_node*>(d2));
119 // case NST_FETCH_INST: return ops_equal(static_cast<fetch_node*>(d1),
120 // static_cast<fetch_node*>(d2);
121 // case NST_CF_INST: return ops_equal(static_cast<cf_node*>(d1),
122 // static_cast<cf_node*>(d2);
123 default:
124 break;
125 }
126 }
127 return false;
128 }
129
130 bool expr_handler::try_fold(value* v) {
131 assert(!v->gvn_source);
132
133 if (v->def)
134 try_fold(v->def);
135
136 if (v->gvn_source)
137 return true;
138
139 return false;
140 }
141
142 bool expr_handler::try_fold(node* n) {
143 return n->fold_dispatch(this);
144 }
145
146 bool expr_handler::fold(node& n) {
147 if (n.subtype == NST_PHI) {
148
149 value *s = n.src[0];
150
151 // FIXME disabling phi folding for registers for now, otherwise we lose
152 // control flow information in some cases
153 // (GCM fails on tests/shaders/glsl-fs-if-nested-loop.shader_test)
154 // probably control flow transformation is required to enable it
155 if (s->is_sgpr())
156 return false;
157
158 for(vvec::iterator I = n.src.begin() + 1, E = n.src.end(); I != E; ++I) {
159 value *v = *I;
160 if (!s->v_equal(v))
161 return false;
162 }
163
164 assign_source(n.dst[0], s);
165 } else {
166 assert(n.subtype == NST_PSI);
167 assert(n.src.size() >= 6);
168
169 value *s = n.src[2];
170 assert(s->gvn_source);
171
172 for(vvec::iterator I = n.src.begin() + 3, E = n.src.end(); I != E; I += 3) {
173 value *v = *(I+2);
174 if (!s->v_equal(v))
175 return false;
176 }
177 assign_source(n.dst[0], s);
178 }
179 return true;
180 }
181
182 bool expr_handler::fold(container_node& n) {
183 return false;
184 }
185
186 bool expr_handler::fold_setcc(alu_node &n) {
187
188 value* v0 = n.src[0]->gvalue();
189 value* v1 = n.src[1]->gvalue();
190
191 assert(v0 && v1 && n.dst[0]);
192
193 unsigned flags = n.bc.op_ptr->flags;
194 unsigned cc = flags & AF_CC_MASK;
195 unsigned cmp_type = flags & AF_CMP_TYPE_MASK;
196 unsigned dst_type = flags & AF_DST_TYPE_MASK;
197
198 bool cond_result;
199 bool have_result = false;
200
201 bool isc0 = v0->is_const();
202 bool isc1 = v1->is_const();
203
204 literal dv, cv0, cv1;
205
206 if (isc0) {
207 cv0 = v0->get_const_value();
208 apply_alu_src_mod(n.bc, 0, cv0);
209 }
210
211 if (isc1) {
212 cv1 = v1->get_const_value();
213 apply_alu_src_mod(n.bc, 1, cv1);
214 }
215
216 if (isc0 && isc1) {
217 cond_result = evaluate_condition(flags, cv0, cv1);
218 have_result = true;
219 } else if (isc1) {
220 if (cmp_type == AF_FLOAT_CMP) {
221 if (n.bc.src[0].abs && !n.bc.src[0].neg) {
222 if (cv1.f < 0.0f && (cc == AF_CC_GT || cc == AF_CC_NE)) {
223 cond_result = true;
224 have_result = true;
225 } else if (cv1.f <= 0.0f && cc == AF_CC_GE) {
226 cond_result = true;
227 have_result = true;
228 }
229 } else if (n.bc.src[0].abs && n.bc.src[0].neg) {
230 if (cv1.f > 0.0f && (cc == AF_CC_GE || cc == AF_CC_E)) {
231 cond_result = false;
232 have_result = true;
233 } else if (cv1.f >= 0.0f && cc == AF_CC_GT) {
234 cond_result = false;
235 have_result = true;
236 }
237 }
238 } else if (cmp_type == AF_UINT_CMP && cv1.u == 0 && cc == AF_CC_GE) {
239 cond_result = true;
240 have_result = true;
241 }
242 } else if (isc0) {
243 if (cmp_type == AF_FLOAT_CMP) {
244 if (n.bc.src[1].abs && !n.bc.src[1].neg) {
245 if (cv0.f <= 0.0f && cc == AF_CC_GT) {
246 cond_result = false;
247 have_result = true;
248 } else if (cv0.f < 0.0f && (cc == AF_CC_GE || cc == AF_CC_E)) {
249 cond_result = false;
250 have_result = true;
251 }
252 } else if (n.bc.src[1].abs && n.bc.src[1].neg) {
253 if (cv0.f >= 0.0f && cc == AF_CC_GE) {
254 cond_result = true;
255 have_result = true;
256 } else if (cv0.f > 0.0f && (cc == AF_CC_GT || cc == AF_CC_NE)) {
257 cond_result = true;
258 have_result = true;
259 }
260 }
261 } else if (cmp_type == AF_UINT_CMP && cv0.u == 0 && cc == AF_CC_GT) {
262 cond_result = false;
263 have_result = true;
264 }
265 } else if (v0 == v1) {
266 bc_alu_src &s0 = n.bc.src[0], &s1 = n.bc.src[1];
267 if (s0.abs == s1.abs && s0.neg == s1.neg && cmp_type != AF_FLOAT_CMP) {
268 // NOTE can't handle float comparisons here because of NaNs
269 cond_result = (cc == AF_CC_E || cc == AF_CC_GE);
270 have_result = true;
271 }
272 }
273
274 if (have_result) {
275 literal result;
276
277 if (cond_result)
278 result = dst_type != AF_FLOAT_DST ?
279 literal(0xFFFFFFFFu) : literal(1.0f);
280 else
281 result = literal(0);
282
283 convert_to_mov(n, sh.get_const_value(result));
284 return fold_alu_op1(n);
285 }
286
287 return false;
288 }
289
290 bool expr_handler::fold(alu_node& n) {
291
292 switch (n.bc.op_ptr->src_count) {
293 case 1: return fold_alu_op1(n);
294 case 2: return fold_alu_op2(n);
295 case 3: return fold_alu_op3(n);
296 default:
297 assert(0);
298 }
299 return false;
300 }
301
302 bool expr_handler::fold(fetch_node& n) {
303
304 unsigned chan = 0;
305 for (vvec::iterator I = n.dst.begin(), E = n.dst.end(); I != E; ++I) {
306 value* &v = *I;
307 if (v) {
308 if (n.bc.dst_sel[chan] == SEL_0)
309 assign_source(*I, get_const(0.0f));
310 else if (n.bc.dst_sel[chan] == SEL_1)
311 assign_source(*I, get_const(1.0f));
312 }
313 ++chan;
314 }
315 return false;
316 }
317
318 bool expr_handler::fold(cf_node& n) {
319 return false;
320 }
321
322 void expr_handler::apply_alu_src_mod(const bc_alu &bc, unsigned src,
323 literal &v) {
324 const bc_alu_src &s = bc.src[src];
325
326 if (s.abs)
327 v = fabs(v.f);
328 if (s.neg)
329 v = -v.f;
330 }
331
332 void expr_handler::apply_alu_dst_mod(const bc_alu &bc, literal &v) {
333 float omod_coeff[] = {2.0f, 4.0, 0.5f};
334
335 if (bc.omod)
336 v = v.f * omod_coeff[bc.omod - 1];
337 if (bc.clamp)
338 v = float_clamp(v.f);
339 }
340
341 bool expr_handler::args_equal(const vvec &l, const vvec &r) {
342
343 assert(l.size() == r.size());
344
345 int s = l.size();
346
347 for (int k = 0; k < s; ++k) {
348 if (!l[k]->v_equal(r[k]))
349 return false;
350 }
351
352 return true;
353 }
354
355 bool expr_handler::ops_equal(const alu_node *l, const alu_node* r) {
356 const bc_alu &b0 = l->bc;
357 const bc_alu &b1 = r->bc;
358
359 if (b0.op != b1.op)
360 return false;
361
362 unsigned src_count = b0.op_ptr->src_count;
363
364 if (b0.index_mode != b1.index_mode)
365 return false;
366
367 if (b0.clamp != b1.clamp || b0.omod != b1.omod)
368 return false;
369
370 for (unsigned s = 0; s < src_count; ++s) {
371 const bc_alu_src &s0 = b0.src[s];
372 const bc_alu_src &s1 = b1.src[s];
373
374 if (s0.abs != s1.abs || s0.neg != s1.neg)
375 return false;
376 }
377 return args_equal(l->src, r->src);
378 }
379
380 bool expr_handler::fold_alu_op1(alu_node& n) {
381
382 assert(!n.src.empty());
383 if (n.src.empty())
384 return false;
385
386 value* v0 = n.src[0]->gvalue();
387
388 assert(v0 && n.dst[0]);
389
390 if (!v0->is_const()) {
391 // handle (MOV -(MOV -x)) => (MOV x)
392 if (n.bc.op == ALU_OP1_MOV && n.bc.src[0].neg && !n.bc.src[1].abs
393 && v0->def && v0->def->is_alu_op(ALU_OP1_MOV)) {
394 alu_node *sd = static_cast<alu_node*>(v0->def);
395 if (!sd->bc.clamp && !sd->bc.omod && !sd->bc.src[0].abs &&
396 sd->bc.src[0].neg) {
397 n.src[0] = sd->src[0];
398 n.bc.src[0].neg = 0;
399 v0 = n.src[0]->gvalue();
400 }
401 }
402
403 if ((n.bc.op == ALU_OP1_MOV || n.bc.op == ALU_OP1_MOVA_INT ||
404 n.bc.op == ALU_OP1_MOVA_GPR_INT)
405 && n.bc.clamp == 0 && n.bc.omod == 0
406 && n.bc.src[0].abs == 0 && n.bc.src[0].neg == 0 &&
407 n.src.size() == 1 /* RIM/SIM can be appended as additional values */) {
408 assign_source(n.dst[0], v0);
409 return true;
410 }
411 return false;
412 }
413
414 literal dv, cv = v0->get_const_value();
415 apply_alu_src_mod(n.bc, 0, cv);
416
417 switch (n.bc.op) {
418 case ALU_OP1_CEIL: dv = ceil(cv.f); break;
419 case ALU_OP1_COS: dv = cos(cv.f * 2.0f * M_PI); break;
420 case ALU_OP1_EXP_IEEE: dv = exp2(cv.f); break;
421 case ALU_OP1_FLOOR: dv = floor(cv.f); break;
422 case ALU_OP1_FLT_TO_INT: dv = (int)cv.f; break; // FIXME: round modes ????
423 case ALU_OP1_FLT_TO_INT_FLOOR: dv = (int32_t)floor(cv.f); break;
424 case ALU_OP1_FLT_TO_INT_RPI: dv = (int32_t)floor(cv.f + 0.5f); break;
425 case ALU_OP1_FLT_TO_INT_TRUNC: dv = (int32_t)trunc(cv.f); break;
426 case ALU_OP1_FLT_TO_UINT: dv = (uint32_t)cv.f; break;
427 case ALU_OP1_FRACT: dv = cv.f - floor(cv.f); break;
428 case ALU_OP1_INT_TO_FLT: dv = (float)cv.i; break;
429 case ALU_OP1_LOG_CLAMPED:
430 case ALU_OP1_LOG_IEEE:
431 if (cv.f != 0.0f)
432 dv = log2(cv.f);
433 else
434 // don't fold to NAN, let the GPU handle it for now
435 // (prevents degenerate LIT tests from failing)
436 return false;
437 break;
438 case ALU_OP1_MOV: dv = cv; break;
439 case ALU_OP1_MOVA_INT: dv = cv; break; // FIXME ???
440 // case ALU_OP1_MOVA_FLOOR: dv = (int32_t)floor(cv.f); break;
441 // case ALU_OP1_MOVA_GPR_INT:
442 case ALU_OP1_NOT_INT: dv = ~cv.i; break;
443 case ALU_OP1_PRED_SET_INV:
444 dv = cv.f == 0.0f ? 1.0f : (cv.f == 1.0f ? 0.0f : cv.f); break;
445 case ALU_OP1_PRED_SET_RESTORE: dv = cv; break;
446 case ALU_OP1_RECIPSQRT_CLAMPED:
447 case ALU_OP1_RECIPSQRT_FF:
448 case ALU_OP1_RECIPSQRT_IEEE: dv = 1.0f / sqrt(cv.f); break;
449 case ALU_OP1_RECIP_CLAMPED:
450 case ALU_OP1_RECIP_FF:
451 case ALU_OP1_RECIP_IEEE: dv = 1.0f / cv.f; break;
452 // case ALU_OP1_RECIP_INT:
453 case ALU_OP1_RECIP_UINT: dv.u = (1ull << 32) / cv.u; break;
454 // case ALU_OP1_RNDNE: dv = floor(cv.f + 0.5f); break;
455 case ALU_OP1_SIN: dv = sin(cv.f * 2.0f * M_PI); break;
456 case ALU_OP1_SQRT_IEEE: dv = sqrt(cv.f); break;
457 case ALU_OP1_TRUNC: dv = trunc(cv.f); break;
458
459 default:
460 return false;
461 }
462
463 apply_alu_dst_mod(n.bc, dv);
464 assign_source(n.dst[0], get_const(dv));
465 return true;
466 }
467
468 bool expr_handler::fold_mul_add(alu_node *n) {
469
470 bool ieee;
471 value* v0 = n->src[0]->gvalue();
472
473 alu_node *d0 = (v0->def && v0->def->is_alu_inst()) ?
474 static_cast<alu_node*>(v0->def) : NULL;
475
476 if (d0) {
477 if (d0->is_alu_op(ALU_OP2_MUL_IEEE))
478 ieee = true;
479 else if (d0->is_alu_op(ALU_OP2_MUL))
480 ieee = false;
481 else
482 return false;
483
484 if (!d0->bc.src[0].abs && !d0->bc.src[1].abs &&
485 !n->bc.src[1].abs && !n->bc.src[0].abs && !d0->bc.omod &&
486 !d0->bc.clamp && !n->bc.omod &&
487 (!d0->src[0]->is_kcache() || !d0->src[1]->is_kcache() ||
488 !n->src[1]->is_kcache())) {
489
490 bool mul_neg = n->bc.src[0].neg;
491
492 n->src.resize(3);
493 n->bc.set_op(ieee ? ALU_OP3_MULADD_IEEE : ALU_OP3_MULADD);
494 n->src[2] = n->src[1];
495 n->bc.src[2] = n->bc.src[1];
496 n->src[0] = d0->src[0];
497 n->bc.src[0] = d0->bc.src[0];
498 n->src[1] = d0->src[1];
499 n->bc.src[1] = d0->bc.src[1];
500
501 n->bc.src[0].neg ^= mul_neg;
502
503 fold_alu_op3(*n);
504 return true;
505 }
506 }
507
508 value* v1 = n->src[1]->gvalue();
509
510 alu_node *d1 = (v1->def && v1->def->is_alu_inst()) ?
511 static_cast<alu_node*>(v1->def) : NULL;
512
513 if (d1) {
514 if (d1->is_alu_op(ALU_OP2_MUL_IEEE))
515 ieee = true;
516 else if (d1->is_alu_op(ALU_OP2_MUL))
517 ieee = false;
518 else
519 return false;
520
521 if (!d1->bc.src[1].abs && !d1->bc.src[0].abs &&
522 !n->bc.src[0].abs && !n->bc.src[1].abs && !d1->bc.omod &&
523 !d1->bc.clamp && !n->bc.omod &&
524 (!d1->src[0]->is_kcache() || !d1->src[1]->is_kcache() ||
525 !n->src[0]->is_kcache())) {
526
527 bool mul_neg = n->bc.src[1].neg;
528
529 n->src.resize(3);
530 n->bc.set_op(ieee ? ALU_OP3_MULADD_IEEE : ALU_OP3_MULADD);
531 n->src[2] = n->src[0];
532 n->bc.src[2] = n->bc.src[0];
533 n->src[1] = d1->src[1];
534 n->bc.src[1] = d1->bc.src[1];
535 n->src[0] = d1->src[0];
536 n->bc.src[0] = d1->bc.src[0];
537
538 n->bc.src[1].neg ^= mul_neg;
539
540 fold_alu_op3(*n);
541 return true;
542 }
543 }
544
545 return false;
546 }
547
548 bool expr_handler::eval_const_op(unsigned op, literal &r,
549 literal cv0, literal cv1) {
550
551 switch (op) {
552 case ALU_OP2_ADD: r = cv0.f + cv1.f; break;
553 case ALU_OP2_ADDC_UINT:
554 r = (uint32_t)(((uint64_t)cv0.u + cv1.u)>>32); break;
555 case ALU_OP2_ADD_INT: r = cv0.i + cv1.i; break;
556 case ALU_OP2_AND_INT: r = cv0.i & cv1.i; break;
557 case ALU_OP2_ASHR_INT: r = cv0.i >> (cv1.i & 0x1F); break;
558 case ALU_OP2_BFM_INT:
559 r = (((1 << (cv0.i & 0x1F)) - 1) << (cv1.i & 0x1F)); break;
560 case ALU_OP2_LSHL_INT: r = cv0.i << cv1.i; break;
561 case ALU_OP2_LSHR_INT: r = cv0.u >> cv1.u; break;
562 case ALU_OP2_MAX:
563 case ALU_OP2_MAX_DX10: r = cv0.f > cv1.f ? cv0.f : cv1.f; break;
564 case ALU_OP2_MAX_INT: r = cv0.i > cv1.i ? cv0.i : cv1.i; break;
565 case ALU_OP2_MAX_UINT: r = cv0.u > cv1.u ? cv0.u : cv1.u; break;
566 case ALU_OP2_MIN:
567 case ALU_OP2_MIN_DX10: r = cv0.f < cv1.f ? cv0.f : cv1.f; break;
568 case ALU_OP2_MIN_INT: r = cv0.i < cv1.i ? cv0.i : cv1.i; break;
569 case ALU_OP2_MIN_UINT: r = cv0.u < cv1.u ? cv0.u : cv1.u; break;
570 case ALU_OP2_MUL:
571 case ALU_OP2_MUL_IEEE: r = cv0.f * cv1.f; break;
572 case ALU_OP2_MULHI_INT:
573 r = (int32_t)(((int64_t)cv0.u * cv1.u)>>32); break;
574 case ALU_OP2_MULHI_UINT:
575 r = (uint32_t)(((uint64_t)cv0.u * cv1.u)>>32); break;
576 case ALU_OP2_MULLO_INT:
577 r = (int32_t)(((int64_t)cv0.u * cv1.u) & 0xFFFFFFFF); break;
578 case ALU_OP2_MULLO_UINT:
579 r = (uint32_t)(((uint64_t)cv0.u * cv1.u) & 0xFFFFFFFF); break;
580 case ALU_OP2_OR_INT: r = cv0.i | cv1.i; break;
581 case ALU_OP2_SUB_INT: r = cv0.i - cv1.i; break;
582 case ALU_OP2_XOR_INT: r = cv0.i ^ cv1.i; break;
583
584 default:
585 return false;
586 }
587
588 return true;
589 }
590
591 // fold the chain of associative ops, e.g. (ADD 2, (ADD x, 3)) => (ADD x, 5)
592 bool expr_handler::fold_assoc(alu_node *n) {
593
594 alu_node *a = n;
595 literal cr;
596
597 int last_arg = -3;
598
599 unsigned op = n->bc.op;
600 bool allow_neg = false, cur_neg = false;
601 bool distribute_neg = false;
602
603 switch(op) {
604 case ALU_OP2_ADD:
605 distribute_neg = true;
606 allow_neg = true;
607 break;
608 case ALU_OP2_MUL:
609 case ALU_OP2_MUL_IEEE:
610 allow_neg = true;
611 break;
612 case ALU_OP3_MULADD:
613 allow_neg = true;
614 op = ALU_OP2_MUL;
615 break;
616 case ALU_OP3_MULADD_IEEE:
617 allow_neg = true;
618 op = ALU_OP2_MUL_IEEE;
619 break;
620 default:
621 if (n->bc.op_ptr->src_count != 2)
622 return false;
623 }
624
625 // check if we can evaluate the op
626 if (!eval_const_op(op, cr, literal(0), literal(0)))
627 return false;
628
629 while (true) {
630
631 value *v0 = a->src[0]->gvalue();
632 value *v1 = a->src[1]->gvalue();
633
634 last_arg = -2;
635
636 if (v1->is_const()) {
637 literal arg = v1->get_const_value();
638 apply_alu_src_mod(a->bc, 1, arg);
639 if (cur_neg && distribute_neg)
640 arg.f = -arg.f;
641
642 if (a == n)
643 cr = arg;
644 else
645 eval_const_op(op, cr, cr, arg);
646
647 if (v0->def) {
648 alu_node *d0 = static_cast<alu_node*>(v0->def);
649 if ((d0->is_alu_op(op) ||
650 (op == ALU_OP2_MUL_IEEE &&
651 d0->is_alu_op(ALU_OP2_MUL))) &&
652 !d0->bc.omod && !d0->bc.clamp &&
653 !a->bc.src[0].abs &&
654 (!a->bc.src[0].neg || allow_neg)) {
655 cur_neg ^= a->bc.src[0].neg;
656 a = d0;
657 continue;
658 }
659 }
660 last_arg = 0;
661
662 }
663
664 if (v0->is_const()) {
665 literal arg = v0->get_const_value();
666 apply_alu_src_mod(a->bc, 0, arg);
667 if (cur_neg && distribute_neg)
668 arg.f = -arg.f;
669
670 if (last_arg == 0) {
671 eval_const_op(op, cr, cr, arg);
672 last_arg = -1;
673 break;
674 }
675
676 if (a == n)
677 cr = arg;
678 else
679 eval_const_op(op, cr, cr, arg);
680
681 if (v1->def) {
682 alu_node *d1 = static_cast<alu_node*>(v1->def);
683 if ((d1->is_alu_op(op) ||
684 (op == ALU_OP2_MUL_IEEE &&
685 d1->is_alu_op(ALU_OP2_MUL))) &&
686 !d1->bc.omod && !d1->bc.clamp &&
687 !a->bc.src[1].abs &&
688 (!a->bc.src[1].neg || allow_neg)) {
689 cur_neg ^= a->bc.src[1].neg;
690 a = d1;
691 continue;
692 }
693 }
694
695 last_arg = 1;
696 }
697
698 break;
699 };
700
701 if (last_arg == -1) {
702 // result is const
703 apply_alu_dst_mod(n->bc, cr);
704
705 if (n->bc.op == op) {
706 convert_to_mov(*n, sh.get_const_value(cr));
707 fold_alu_op1(*n);
708 return true;
709 } else { // MULADD => ADD
710 n->src[0] = n->src[2];
711 n->bc.src[0] = n->bc.src[2];
712 n->src[1] = sh.get_const_value(cr);
713 memset(&n->bc.src[1], 0, sizeof(bc_alu_src));
714
715 n->src.resize(2);
716 n->bc.set_op(ALU_OP2_ADD);
717 }
718 } else if (last_arg >= 0) {
719 n->src[0] = a->src[last_arg];
720 n->bc.src[0] = a->bc.src[last_arg];
721 n->bc.src[0].neg ^= cur_neg;
722 n->src[1] = sh.get_const_value(cr);
723 memset(&n->bc.src[1], 0, sizeof(bc_alu_src));
724 }
725
726 return false;
727 }
728
729 bool expr_handler::fold_alu_op2(alu_node& n) {
730
731 if (n.src.size() < 2)
732 return false;
733
734 unsigned flags = n.bc.op_ptr->flags;
735
736 if (flags & AF_SET) {
737 return fold_setcc(n);
738 }
739
740 if (!sh.safe_math && (flags & AF_M_ASSOC)) {
741 if (fold_assoc(&n))
742 return true;
743 }
744
745 value* v0 = n.src[0]->gvalue();
746 value* v1 = n.src[1]->gvalue();
747
748 assert(v0 && v1);
749
750 // handle some operations with equal args, e.g. x + x => x * 2
751 if (v0 == v1) {
752 if (n.bc.src[0].neg == n.bc.src[1].neg &&
753 n.bc.src[0].abs == n.bc.src[1].abs) {
754 switch (n.bc.op) {
755 case ALU_OP2_MIN: // (MIN x, x) => (MOV x)
756 case ALU_OP2_MIN_DX10:
757 case ALU_OP2_MAX:
758 case ALU_OP2_MAX_DX10:
759 convert_to_mov(n, v0, n.bc.src[0].neg, n.bc.src[0].abs);
760 return fold_alu_op1(n);
761 case ALU_OP2_ADD: // (ADD x, x) => (MUL x, 2)
762 if (!sh.safe_math) {
763 n.src[1] = sh.get_const_value(2.0f);
764 memset(&n.bc.src[1], 0, sizeof(bc_alu_src));
765 n.bc.set_op(ALU_OP2_MUL);
766 return fold_alu_op2(n);
767 }
768 break;
769 }
770 }
771 if (n.bc.src[0].neg != n.bc.src[1].neg &&
772 n.bc.src[0].abs == n.bc.src[1].abs) {
773 switch (n.bc.op) {
774 case ALU_OP2_ADD: // (ADD x, -x) => (MOV 0)
775 if (!sh.safe_math) {
776 convert_to_mov(n, sh.get_const_value(literal(0)));
777 return fold_alu_op1(n);
778 }
779 break;
780 }
781 }
782 }
783
784 if (n.bc.op == ALU_OP2_ADD) {
785 if (fold_mul_add(&n))
786 return true;
787 }
788
789 bool isc0 = v0->is_const();
790 bool isc1 = v1->is_const();
791
792 if (!isc0 && !isc1)
793 return false;
794
795 literal dv, cv0, cv1;
796
797 if (isc0) {
798 cv0 = v0->get_const_value();
799 apply_alu_src_mod(n.bc, 0, cv0);
800 }
801
802 if (isc1) {
803 cv1 = v1->get_const_value();
804 apply_alu_src_mod(n.bc, 1, cv1);
805 }
806
807 if (isc0 && isc1) {
808
809 if (!eval_const_op(n.bc.op, dv, cv0, cv1))
810 return false;
811
812 } else { // one source is const
813
814 if (isc0 && cv0 == literal(0)) {
815 switch (n.bc.op) {
816 case ALU_OP2_ADD:
817 case ALU_OP2_ADD_INT:
818 case ALU_OP2_MAX_UINT:
819 case ALU_OP2_OR_INT:
820 case ALU_OP2_XOR_INT:
821 convert_to_mov(n, n.src[1], n.bc.src[1].neg, n.bc.src[1].abs);
822 return fold_alu_op1(n);
823 case ALU_OP2_AND_INT:
824 case ALU_OP2_ASHR_INT:
825 case ALU_OP2_LSHL_INT:
826 case ALU_OP2_LSHR_INT:
827 case ALU_OP2_MIN_UINT:
828 case ALU_OP2_MUL:
829 case ALU_OP2_MULHI_UINT:
830 case ALU_OP2_MULLO_UINT:
831 convert_to_mov(n, sh.get_const_value(literal(0)));
832 return fold_alu_op1(n);
833 }
834 } else if (isc1 && cv1 == literal(0)) {
835 switch (n.bc.op) {
836 case ALU_OP2_ADD:
837 case ALU_OP2_ADD_INT:
838 case ALU_OP2_ASHR_INT:
839 case ALU_OP2_LSHL_INT:
840 case ALU_OP2_LSHR_INT:
841 case ALU_OP2_MAX_UINT:
842 case ALU_OP2_OR_INT:
843 case ALU_OP2_SUB_INT:
844 case ALU_OP2_XOR_INT:
845 convert_to_mov(n, n.src[0], n.bc.src[0].neg, n.bc.src[0].abs);
846 return fold_alu_op1(n);
847 case ALU_OP2_AND_INT:
848 case ALU_OP2_MIN_UINT:
849 case ALU_OP2_MUL:
850 case ALU_OP2_MULHI_UINT:
851 case ALU_OP2_MULLO_UINT:
852 convert_to_mov(n, sh.get_const_value(literal(0)));
853 return fold_alu_op1(n);
854 }
855 } else if (isc0 && cv0 == literal(1.0f)) {
856 switch (n.bc.op) {
857 case ALU_OP2_MUL:
858 case ALU_OP2_MUL_IEEE:
859 convert_to_mov(n, n.src[1], n.bc.src[1].neg, n.bc.src[1].abs);
860 return fold_alu_op1(n);
861 }
862 } else if (isc1 && cv1 == literal(1.0f)) {
863 switch (n.bc.op) {
864 case ALU_OP2_MUL:
865 case ALU_OP2_MUL_IEEE:
866 convert_to_mov(n, n.src[0], n.bc.src[0].neg, n.bc.src[0].abs);
867 return fold_alu_op1(n);
868 }
869 }
870
871 return false;
872 }
873
874 apply_alu_dst_mod(n.bc, dv);
875 assign_source(n.dst[0], get_const(dv));
876 return true;
877 }
878
879 bool expr_handler::evaluate_condition(unsigned alu_cnd_flags,
880 literal s1, literal s2) {
881
882 unsigned cmp_type = alu_cnd_flags & AF_CMP_TYPE_MASK;
883 unsigned cc = alu_cnd_flags & AF_CC_MASK;
884
885 switch (cmp_type) {
886 case AF_FLOAT_CMP: {
887 switch (cc) {
888 case AF_CC_E : return s1.f == s2.f;
889 case AF_CC_GT: return s1.f > s2.f;
890 case AF_CC_GE: return s1.f >= s2.f;
891 case AF_CC_NE: return s1.f != s2.f;
892 case AF_CC_LT: return s1.f < s2.f;
893 case AF_CC_LE: return s1.f <= s2.f;
894 default:
895 assert(!"invalid condition code");
896 return false;
897 }
898 }
899 case AF_INT_CMP: {
900 switch (cc) {
901 case AF_CC_E : return s1.i == s2.i;
902 case AF_CC_GT: return s1.i > s2.i;
903 case AF_CC_GE: return s1.i >= s2.i;
904 case AF_CC_NE: return s1.i != s2.i;
905 case AF_CC_LT: return s1.i < s2.i;
906 case AF_CC_LE: return s1.i <= s2.i;
907 default:
908 assert(!"invalid condition code");
909 return false;
910 }
911 }
912 case AF_UINT_CMP: {
913 switch (cc) {
914 case AF_CC_E : return s1.u == s2.u;
915 case AF_CC_GT: return s1.u > s2.u;
916 case AF_CC_GE: return s1.u >= s2.u;
917 case AF_CC_NE: return s1.u != s2.u;
918 case AF_CC_LT: return s1.u < s2.u;
919 case AF_CC_LE: return s1.u <= s2.u;
920 default:
921 assert(!"invalid condition code");
922 return false;
923 }
924 }
925 default:
926 assert(!"invalid cmp_type");
927 return false;
928 }
929 }
930
931 bool expr_handler::fold_alu_op3(alu_node& n) {
932
933 if (n.src.size() < 3)
934 return false;
935
936 if (!sh.safe_math && (n.bc.op_ptr->flags & AF_M_ASSOC)) {
937 if (fold_assoc(&n))
938 return true;
939 }
940
941 value* v0 = n.src[0]->gvalue();
942 value* v1 = n.src[1]->gvalue();
943 value* v2 = n.src[2]->gvalue();
944
945 assert(v0 && v1 && v2 && n.dst[0]);
946
947 bool isc0 = v0->is_const();
948 bool isc1 = v1->is_const();
949 bool isc2 = v2->is_const();
950
951 literal dv, cv0, cv1, cv2;
952
953 if (isc0) {
954 cv0 = v0->get_const_value();
955 apply_alu_src_mod(n.bc, 0, cv0);
956 }
957
958 if (isc1) {
959 cv1 = v1->get_const_value();
960 apply_alu_src_mod(n.bc, 1, cv1);
961 }
962
963 if (isc2) {
964 cv2 = v2->get_const_value();
965 apply_alu_src_mod(n.bc, 2, cv2);
966 }
967
968 unsigned flags = n.bc.op_ptr->flags;
969
970 if (flags & AF_CMOV) {
971 int src = 0;
972
973 if (v1 == v2 && n.bc.src[1].neg == n.bc.src[2].neg) {
974 // result doesn't depend on condition, convert to MOV
975 src = 1;
976 } else if (isc0) {
977 // src0 is const, condition can be evaluated, convert to MOV
978 bool cond = evaluate_condition(n.bc.op_ptr->flags & (AF_CC_MASK |
979 AF_CMP_TYPE_MASK), cv0, literal(0));
980 src = cond ? 1 : 2;
981 }
982
983 if (src) {
984 // if src is selected, convert to MOV
985 convert_to_mov(n, n.src[src], n.bc.src[src].neg);
986 return fold_alu_op1(n);
987 }
988 }
989
990 // handle (MULADD a, x, MUL (x, b)) => (MUL x, ADD (a, b))
991 if (!sh.safe_math && (n.bc.op == ALU_OP3_MULADD ||
992 n.bc.op == ALU_OP3_MULADD_IEEE)) {
993
994 unsigned op = n.bc.op == ALU_OP3_MULADD_IEEE ?
995 ALU_OP2_MUL_IEEE : ALU_OP2_MUL;
996
997 if (!isc2 && v2->def && v2->def->is_alu_op(op)) {
998
999 alu_node *md = static_cast<alu_node*>(v2->def);
1000 value *mv0 = md->src[0]->gvalue();
1001 value *mv1 = md->src[1]->gvalue();
1002
1003 int es0 = -1, es1;
1004
1005 if (v0 == mv0) {
1006 es0 = 0;
1007 es1 = 0;
1008 } else if (v0 == mv1) {
1009 es0 = 0;
1010 es1 = 1;
1011 } else if (v1 == mv0) {
1012 es0 = 1;
1013 es1 = 0;
1014 } else if (v1 == mv1) {
1015 es0 = 1;
1016 es1 = 1;
1017 }
1018
1019 if (es0 != -1) {
1020 value *va0 = es0 == 0 ? v1 : v0;
1021 value *va1 = es1 == 0 ? mv1 : mv0;
1022
1023 alu_node *add = sh.create_alu();
1024 add->bc.set_op(ALU_OP2_ADD);
1025
1026 add->dst.resize(1);
1027 add->src.resize(2);
1028
1029 value *t = sh.create_temp_value();
1030 t->def = add;
1031 add->dst[0] = t;
1032 add->src[0] = va0;
1033 add->src[1] = va1;
1034 add->bc.src[0] = n.bc.src[!es0];
1035 add->bc.src[1] = md->bc.src[!es1];
1036
1037 add->bc.src[1].neg ^= n.bc.src[2].neg ^
1038 (n.bc.src[es0].neg != md->bc.src[es1].neg);
1039
1040 n.insert_before(add);
1041 vt.add_value(t);
1042
1043 t = t->gvalue();
1044
1045 if (es0 == 1) {
1046 n.src[0] = n.src[1];
1047 n.bc.src[0] = n.bc.src[1];
1048 }
1049
1050 n.src[1] = t;
1051 memset(&n.bc.src[1], 0, sizeof(bc_alu_src));
1052
1053 n.src.resize(2);
1054
1055 n.bc.set_op(op);
1056 return fold_alu_op2(n);
1057 }
1058 }
1059 }
1060
1061 if (!isc0 && !isc1 && !isc2)
1062 return false;
1063
1064 if (isc0 && isc1 && isc2) {
1065 switch (n.bc.op) {
1066 case ALU_OP3_MULADD_IEEE:
1067 case ALU_OP3_MULADD: dv = cv0.f * cv1.f + cv2.f; break;
1068
1069 // TODO
1070
1071 default:
1072 return false;
1073 }
1074 } else {
1075 if (isc0 && isc1) {
1076 switch (n.bc.op) {
1077 case ALU_OP3_MULADD:
1078 case ALU_OP3_MULADD_IEEE:
1079 dv = cv0.f * cv1.f;
1080 n.bc.set_op(ALU_OP2_ADD);
1081 n.src[0] = sh.get_const_value(dv);
1082 memset(&n.bc.src[0], 0, sizeof(bc_alu_src));
1083 n.src[1] = n.src[2];
1084 n.bc.src[1] = n.bc.src[2];
1085 n.src.resize(2);
1086 return fold_alu_op2(n);
1087 }
1088 }
1089
1090 if (n.bc.op == ALU_OP3_MULADD) {
1091 if ((isc0 && cv0 == literal(0)) || (isc1 && cv1 == literal(0))) {
1092 convert_to_mov(n, n.src[2], n.bc.src[2].neg, n.bc.src[2].abs);
1093 return fold_alu_op1(n);
1094 }
1095 }
1096
1097 if (n.bc.op == ALU_OP3_MULADD || n.bc.op == ALU_OP3_MULADD_IEEE) {
1098 unsigned op = n.bc.op == ALU_OP3_MULADD_IEEE ?
1099 ALU_OP2_MUL_IEEE : ALU_OP2_MUL;
1100
1101 if (isc1 && v0 == v2) {
1102 cv1.f += (n.bc.src[2].neg != n.bc.src[0].neg ? -1.0f : 1.0f);
1103 n.src[1] = sh.get_const_value(cv1);
1104 n.bc.src[1].neg = 0;
1105 n.bc.src[1].abs = 0;
1106 n.bc.set_op(op);
1107 n.src.resize(2);
1108 return fold_alu_op2(n);
1109 } else if (isc0 && v1 == v2) {
1110 cv0.f += (n.bc.src[2].neg != n.bc.src[1].neg ? -1.0f : 1.0f);
1111 n.src[0] = sh.get_const_value(cv0);
1112 n.bc.src[0].neg = 0;
1113 n.bc.src[0].abs = 0;
1114 n.bc.set_op(op);
1115 n.src.resize(2);
1116 return fold_alu_op2(n);
1117 }
1118 }
1119
1120 return false;
1121 }
1122
1123 apply_alu_dst_mod(n.bc, dv);
1124 assign_source(n.dst[0], get_const(dv));
1125 return true;
1126 }
1127
1128 unsigned invert_setcc_condition(unsigned cc, bool &swap_args) {
1129 unsigned ncc = 0;
1130
1131 switch (cc) {
1132 case AF_CC_E: ncc = AF_CC_NE; break;
1133 case AF_CC_NE: ncc = AF_CC_E; break;
1134 case AF_CC_GE: ncc = AF_CC_GT; swap_args = true; break;
1135 case AF_CC_GT: ncc = AF_CC_GE; swap_args = true; break;
1136 default:
1137 assert(!"unexpected condition code");
1138 break;
1139 }
1140 return ncc;
1141 }
1142
1143 unsigned get_setcc_op(unsigned cc, unsigned cmp_type, bool int_dst) {
1144
1145 if (int_dst && cmp_type == AF_FLOAT_CMP) {
1146 switch (cc) {
1147 case AF_CC_E: return ALU_OP2_SETE_DX10;
1148 case AF_CC_NE: return ALU_OP2_SETNE_DX10;
1149 case AF_CC_GT: return ALU_OP2_SETGT_DX10;
1150 case AF_CC_GE: return ALU_OP2_SETGE_DX10;
1151 }
1152 } else {
1153
1154 switch(cmp_type) {
1155 case AF_FLOAT_CMP: {
1156 switch (cc) {
1157 case AF_CC_E: return ALU_OP2_SETE;
1158 case AF_CC_NE: return ALU_OP2_SETNE;
1159 case AF_CC_GT: return ALU_OP2_SETGT;
1160 case AF_CC_GE: return ALU_OP2_SETGE;
1161 }
1162 break;
1163 }
1164 case AF_INT_CMP: {
1165 switch (cc) {
1166 case AF_CC_E: return ALU_OP2_SETE_INT;
1167 case AF_CC_NE: return ALU_OP2_SETNE_INT;
1168 case AF_CC_GT: return ALU_OP2_SETGT_INT;
1169 case AF_CC_GE: return ALU_OP2_SETGE_INT;
1170 }
1171 break;
1172 }
1173 case AF_UINT_CMP: {
1174 switch (cc) {
1175 case AF_CC_E: return ALU_OP2_SETE_INT;
1176 case AF_CC_NE: return ALU_OP2_SETNE_INT;
1177 case AF_CC_GT: return ALU_OP2_SETGT_UINT;
1178 case AF_CC_GE: return ALU_OP2_SETGE_UINT;
1179 }
1180 break;
1181 }
1182 }
1183 }
1184
1185 assert(!"unexpected cc&cmp_type combination");
1186 return ~0u;
1187 }
1188
1189 unsigned get_predsetcc_op(unsigned cc, unsigned cmp_type) {
1190
1191 switch(cmp_type) {
1192 case AF_FLOAT_CMP: {
1193 switch (cc) {
1194 case AF_CC_E: return ALU_OP2_PRED_SETE;
1195 case AF_CC_NE: return ALU_OP2_PRED_SETNE;
1196 case AF_CC_GT: return ALU_OP2_PRED_SETGT;
1197 case AF_CC_GE: return ALU_OP2_PRED_SETGE;
1198 }
1199 break;
1200 }
1201 case AF_INT_CMP: {
1202 switch (cc) {
1203 case AF_CC_E: return ALU_OP2_PRED_SETE_INT;
1204 case AF_CC_NE: return ALU_OP2_PRED_SETNE_INT;
1205 case AF_CC_GT: return ALU_OP2_PRED_SETGT_INT;
1206 case AF_CC_GE: return ALU_OP2_PRED_SETGE_INT;
1207 }
1208 break;
1209 }
1210 case AF_UINT_CMP: {
1211 switch (cc) {
1212 case AF_CC_E: return ALU_OP2_PRED_SETE_INT;
1213 case AF_CC_NE: return ALU_OP2_PRED_SETNE_INT;
1214 case AF_CC_GT: return ALU_OP2_PRED_SETGT_UINT;
1215 case AF_CC_GE: return ALU_OP2_PRED_SETGE_UINT;
1216 }
1217 break;
1218 }
1219 }
1220
1221 assert(!"unexpected cc&cmp_type combination");
1222 return ~0u;
1223 }
1224
1225 unsigned get_killcc_op(unsigned cc, unsigned cmp_type) {
1226
1227 switch(cmp_type) {
1228 case AF_FLOAT_CMP: {
1229 switch (cc) {
1230 case AF_CC_E: return ALU_OP2_KILLE;
1231 case AF_CC_NE: return ALU_OP2_KILLNE;
1232 case AF_CC_GT: return ALU_OP2_KILLGT;
1233 case AF_CC_GE: return ALU_OP2_KILLGE;
1234 }
1235 break;
1236 }
1237 case AF_INT_CMP: {
1238 switch (cc) {
1239 case AF_CC_E: return ALU_OP2_KILLE_INT;
1240 case AF_CC_NE: return ALU_OP2_KILLNE_INT;
1241 case AF_CC_GT: return ALU_OP2_KILLGT_INT;
1242 case AF_CC_GE: return ALU_OP2_KILLGE_INT;
1243 }
1244 break;
1245 }
1246 case AF_UINT_CMP: {
1247 switch (cc) {
1248 case AF_CC_E: return ALU_OP2_KILLE_INT;
1249 case AF_CC_NE: return ALU_OP2_KILLNE_INT;
1250 case AF_CC_GT: return ALU_OP2_KILLGT_UINT;
1251 case AF_CC_GE: return ALU_OP2_KILLGE_UINT;
1252 }
1253 break;
1254 }
1255 }
1256
1257 assert(!"unexpected cc&cmp_type combination");
1258 return ~0u;
1259 }
1260
1261 unsigned get_cndcc_op(unsigned cc, unsigned cmp_type) {
1262
1263 switch(cmp_type) {
1264 case AF_FLOAT_CMP: {
1265 switch (cc) {
1266 case AF_CC_E: return ALU_OP3_CNDE;
1267 case AF_CC_GT: return ALU_OP3_CNDGT;
1268 case AF_CC_GE: return ALU_OP3_CNDGE;
1269 }
1270 break;
1271 }
1272 case AF_INT_CMP: {
1273 switch (cc) {
1274 case AF_CC_E: return ALU_OP3_CNDE_INT;
1275 case AF_CC_GT: return ALU_OP3_CNDGT_INT;
1276 case AF_CC_GE: return ALU_OP3_CNDGE_INT;
1277 }
1278 break;
1279 }
1280 }
1281
1282 assert(!"unexpected cc&cmp_type combination");
1283 return ~0u;
1284 }
1285
1286
1287 void convert_predset_to_set(shader& sh, alu_node* a) {
1288
1289 unsigned flags = a->bc.op_ptr->flags;
1290 unsigned cc = flags & AF_CC_MASK;
1291 unsigned cmp_type = flags & AF_CMP_TYPE_MASK;
1292
1293 bool swap_args = false;
1294
1295 cc = invert_setcc_condition(cc, swap_args);
1296
1297 unsigned newop = get_setcc_op(cc, cmp_type, true);
1298
1299 a->dst.resize(1);
1300 a->bc.set_op(newop);
1301
1302 if (swap_args) {
1303 std::swap(a->src[0], a->src[1]);
1304 std::swap(a->bc.src[0], a->bc.src[1]);
1305 }
1306
1307 a->bc.update_exec_mask = 0;
1308 a->bc.update_pred = 0;
1309 }
1310
1311 } // namespace r600_sb