Merge remote-tracking branch 'public/master' into vulkan
[mesa.git] / src / compiler / nir / nir_opt_algebraic.py
1 #! /usr/bin/env python
2 # -*- encoding: utf-8 -*-
3 #
4 # Copyright (C) 2014 Intel Corporation
5 #
6 # Permission is hereby granted, free of charge, to any person obtaining a
7 # copy of this software and associated documentation files (the "Software"),
8 # to deal in the Software without restriction, including without limitation
9 # the rights to use, copy, modify, merge, publish, distribute, sublicense,
10 # and/or sell copies of the Software, and to permit persons to whom the
11 # Software is furnished to do so, subject to the following conditions:
12 #
13 # The above copyright notice and this permission notice (including the next
14 # paragraph) shall be included in all copies or substantial portions of the
15 # Software.
16 #
17 # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18 # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19 # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
20 # THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21 # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
22 # FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
23 # IN THE SOFTWARE.
24 #
25 # Authors:
26 # Jason Ekstrand (jason@jlekstrand.net)
27
28 import nir_algebraic
29
# Shorthand pattern-variable names, so rules can be written as a, b, c, d
# instead of quoted strings.
a, b, c, d = 'a', 'b', 'c', 'd'
35
36 # Written in the form (<search>, <replace>) where <search> is an expression
37 # and <replace> is either an expression or a value. An expression is
38 # defined as a tuple of the form ([~]<op>, <src0>, <src1>, <src2>, <src3>)
39 # where each source is either an expression or a value. A value can be
40 # either a numeric constant or a string representing a variable name.
41 #
42 # If the opcode in a search expression is prefixed by a '~' character, this
43 # indicates that the operation is inexact. Such operations will only get
44 # applied to SSA values that do not have the exact bit set. This should be
45 # used by any optimizations that are not bit-for-bit exact. It should not,
46 # however, be used for backend-requested lowering operations as those need to
47 # happen regardless of precision.
48 #
49 # Variable names are specified as "[#]name[@type]" where "#" indicates that
50 # the given variable will only match constants and the type indicates that
51 # the given variable will only match values from ALU instructions with the
52 # given output type.
53 #
54 # For constants, you have to be careful to make sure that it is the right
55 # type because python is unaware of the source and destination types of the
56 # opcodes.
57
# Main rule table.  Each entry is (<search>, <replace>) or
# (<search>, <replace>, <condition>), where <condition> is a string such as
# 'options->lower_flrp' or '!options->lower_flrp'.
# NOTE(review): the condition string presumably gates the rule on a compiler
# option in the generated pass -- confirm against nir_algebraic.
58 optimizations = [
# Nested negate / absolute value
59 (('fneg', ('fneg', a)), a),
60 (('ineg', ('ineg', a)), a),
61 (('fabs', ('fabs', a)), ('fabs', a)),
62 (('fabs', ('fneg', a)), ('fabs', a)),
63 (('iabs', ('iabs', a)), ('iabs', a)),
64 (('iabs', ('ineg', a)), ('iabs', a)),
# Additive and multiplicative identities
65 (('~fadd', a, 0.0), a),
66 (('iadd', a, 0), a),
67 (('usadd_4x8', a, 0), a),
68 (('usadd_4x8', a, ~0), ~0),
69 (('~fadd', ('fmul', a, b), ('fmul', a, c)), ('fmul', a, ('fadd', b, c))),
70 (('iadd', ('imul', a, b), ('imul', a, c)), ('imul', a, ('iadd', b, c))),
71 (('~fadd', ('fneg', a), a), 0.0),
72 (('iadd', ('ineg', a), a), 0),
73 (('iadd', ('ineg', a), ('iadd', a, b)), b),
74 (('iadd', a, ('iadd', ('ineg', a), b)), b),
75 (('~fadd', ('fneg', a), ('fadd', a, b)), b),
76 (('~fadd', a, ('fadd', ('fneg', a), b)), b),
77 (('~fmul', a, 0.0), 0.0),
78 (('imul', a, 0), 0),
79 (('umul_unorm_4x8', a, 0), 0),
80 (('umul_unorm_4x8', a, ~0), a),
81 (('fmul', a, 1.0), a),
82 (('imul', a, 1), a),
83 (('fmul', a, -1.0), ('fneg', a)),
84 (('imul', a, -1), ('ineg', a)),
# ffma / flrp / ffract identities and lowering.  Rules that share a search
# opcode are split by complementary conditions ('options->lower_X' lowers
# the op, '!options->lower_X' forms it).
85 (('~ffma', 0.0, a, b), b),
86 (('~ffma', a, 0.0, b), b),
87 (('~ffma', a, b, 0.0), ('fmul', a, b)),
88 (('ffma', a, 1.0, b), ('fadd', a, b)),
89 (('ffma', 1.0, a, b), ('fadd', a, b)),
90 (('~flrp', a, b, 0.0), a),
91 (('~flrp', a, b, 1.0), b),
92 (('~flrp', a, a, b), a),
93 (('~flrp', 0.0, a, b), ('fmul', a, b)),
94 (('~flrp', a, b, ('b2f', c)), ('bcsel', c, b, a), 'options->lower_flrp'),
95 (('flrp', a, b, c), ('fadd', ('fmul', c, ('fsub', b, a)), a), 'options->lower_flrp'),
96 (('ffract', a), ('fsub', a, ('ffloor', a)), 'options->lower_ffract'),
97 (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', ('b2f', c)))), ('fmul', b, ('b2f', c))), ('bcsel', c, b, a), 'options->lower_flrp'),
98 (('~fadd', ('fmul', a, ('fadd', 1.0, ('fneg', c ))), ('fmul', b, c )), ('flrp', a, b, c), '!options->lower_flrp'),
99 (('~fadd', a, ('fmul', ('b2f', c), ('fadd', b, ('fneg', a)))), ('bcsel', c, b, a), 'options->lower_flrp'),
100 (('~fadd', a, ('fmul', c , ('fadd', b, ('fneg', a)))), ('flrp', a, b, c), '!options->lower_flrp'),
101 (('ffma', a, b, c), ('fadd', ('fmul', a, b), c), 'options->lower_ffma'),
102 (('~fadd', ('fmul', a, b), c), ('ffma', a, b, c), '!options->lower_ffma'),
103 # Comparison simplifications
# The float variants are marked inexact ('~'); the integer variants are exact.
104 (('~inot', ('flt', a, b)), ('fge', a, b)),
105 (('~inot', ('fge', a, b)), ('flt', a, b)),
106 (('~inot', ('feq', a, b)), ('fne', a, b)),
107 (('~inot', ('fne', a, b)), ('feq', a, b)),
108 (('inot', ('ilt', a, b)), ('ige', a, b)),
109 (('inot', ('ige', a, b)), ('ilt', a, b)),
110 (('inot', ('ieq', a, b)), ('ine', a, b)),
111 (('inot', ('ine', a, b)), ('ieq', a, b)),
112
113 # 0.0 >= b2f(a)
114 # b2f(a) <= 0.0
115 # b2f(a) == 0.0 because b2f(a) can only be 0 or 1
116 # inot(a)
117 (('fge', 0.0, ('b2f', a)), ('inot', a)),
118
119 # 0.0 < fabs(a)
120 # fabs(a) > 0.0
121 # fabs(a) != 0.0 because fabs(a) must be >= 0
122 # a != 0.0
123 (('flt', 0.0, ('fabs', a)), ('fne', a, 0.0)),
124
125 (('fge', ('fneg', ('fabs', a)), 0.0), ('feq', a, 0.0)),
# Select (bcsel), min/max and saturate simplifications
126 (('bcsel', ('flt', b, a), b, a), ('fmin', a, b)),
127 (('bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
128 (('bcsel', ('inot', 'a@bool'), b, c), ('bcsel', a, c, b)),
129 (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
130 (('bcsel', a, True, 'b@bool'), ('ior', a, b)),
131 (('fmin', a, a), a),
132 (('fmax', a, a), a),
133 (('imin', a, a), a),
134 (('imax', a, a), a),
135 (('umin', a, a), a),
136 (('umax', a, a), a),
137 (('~fmin', ('fmax', a, 0.0), 1.0), ('fsat', a), '!options->lower_fsat'),
138 (('~fmax', ('fmin', a, 1.0), 0.0), ('fsat', a), '!options->lower_fsat'),
139 (('fsat', a), ('fmin', ('fmax', a, 0.0), 1.0), 'options->lower_fsat'),
140 (('fsat', ('fsat', a)), ('fsat', a)),
141 (('fmin', ('fmax', ('fmin', ('fmax', a, 0.0), 1.0), 0.0), 1.0), ('fmin', ('fmax', a, 0.0), 1.0)),
142 (('~ior', ('flt', a, b), ('flt', a, c)), ('flt', a, ('fmax', b, c))),
143 (('~ior', ('flt', a, c), ('flt', b, c)), ('flt', ('fmin', a, b), c)),
144 (('~ior', ('fge', a, b), ('fge', a, c)), ('fge', a, ('fmin', b, c))),
145 (('~ior', ('fge', a, c), ('fge', b, c)), ('fge', ('fmax', a, b), c)),
# slt/sge/seq/sne: fabs of the result is dropped, and under
# 'options->lower_scmp' each is rewritten as b2f of the matching comparison.
146 (('fabs', ('slt', a, b)), ('slt', a, b)),
147 (('fabs', ('sge', a, b)), ('sge', a, b)),
148 (('fabs', ('seq', a, b)), ('seq', a, b)),
149 (('fabs', ('sne', a, b)), ('sne', a, b)),
150 (('slt', a, b), ('b2f', ('flt', a, b)), 'options->lower_scmp'),
151 (('sge', a, b), ('b2f', ('fge', a, b)), 'options->lower_scmp'),
152 (('seq', a, b), ('b2f', ('feq', a, b)), 'options->lower_scmp'),
153 (('sne', a, b), ('b2f', ('fne', a, b)), 'options->lower_scmp'),
154 (('fne', ('fneg', a), a), ('fne', a, 0.0)),
155 (('feq', ('fneg', a), a), ('feq', a, 0.0)),
156 # Emulating booleans
157 (('imul', ('b2i', a), ('b2i', b)), ('b2i', ('iand', a, b))),
158 (('fmul', ('b2f', a), ('b2f', b)), ('b2f', ('iand', a, b))),
159 (('fsat', ('fadd', ('b2f', a), ('b2f', b))), ('b2f', ('ior', a, b))),
160 (('iand', 'a@bool', 1.0), ('b2f', a)),
161 (('flt', ('fneg', ('b2f', a)), 0), a), # Generated by TGSI KILL_IF.
162 (('flt', ('fsub', 0.0, ('b2f', a)), 0), a), # Generated by TGSI KILL_IF.
163 # Comparison with the same args. Note that these are not done for
164 # the float versions because NaN always returns false on float
165 # inequalities.
166 (('ilt', a, a), False),
167 (('ige', a, a), True),
168 (('ieq', a, a), True),
169 (('ine', a, a), False),
170 (('ult', a, a), False),
171 (('uge', a, a), True),
172 # Logical and bit operations
173 (('fand', a, 0.0), 0.0),
174 (('iand', a, a), a),
175 (('iand', a, ~0), a),
176 (('iand', a, 0), 0),
177 (('ior', a, a), a),
178 (('ior', a, 0), a),
179 (('fxor', a, a), 0.0),
180 (('ixor', a, a), 0),
181 (('ixor', a, 0), a),
182 (('inot', ('inot', a)), a),
183 # DeMorgan's Laws
184 (('iand', ('inot', a), ('inot', b)), ('inot', ('ior', a, b))),
185 (('ior', ('inot', a), ('inot', b)), ('inot', ('iand', a, b))),
186 # Shift optimizations
187 (('ishl', 0, a), 0),
188 (('ishl', a, 0), a),
189 (('ishr', 0, a), 0),
190 (('ishr', a, 0), a),
191 (('ushr', 0, a), 0),
192 (('ushr', a, 0), a),
193 (('iand', 0xff, ('ushr', a, 24)), ('ushr', a, 24)),
194 (('iand', 0xffff, ('ushr', a, 16)), ('ushr', a, 16)),
195 # Exponential/logarithmic identities
196 (('~fexp2', ('flog2', a)), a), # 2^lg2(a) = a
197 (('~flog2', ('fexp2', a)), a), # lg2(2^a) = a
198 (('fpow', a, b), ('fexp2', ('fmul', ('flog2', a), b)), 'options->lower_fpow'), # a^b = 2^(lg2(a)*b)
199 (('~fexp2', ('fmul', ('flog2', a), b)), ('fpow', a, b), '!options->lower_fpow'), # 2^(lg2(a)*b) = a^b
200 (('~fexp2', ('fadd', ('fmul', ('flog2', a), b), ('fmul', ('flog2', c), d))),
201 ('~fmul', ('fpow', a, b), ('fpow', c, d)), '!options->lower_fpow'), # 2^(lg2(a) * b + lg2(c) * d) = a^b * c^d
202 (('~fpow', a, 1.0), a),
203 (('~fpow', a, 2.0), ('fmul', a, a)),
204 (('~fpow', a, 4.0), ('fmul', ('fmul', a, a), ('fmul', a, a))),
205 (('~fpow', 2.0, a), ('fexp2', a)),
206 (('~fpow', ('fpow', a, 2.2), 0.454545), a),
207 (('~fpow', ('fabs', ('fpow', a, 2.2)), 0.454545), ('fabs', a)),
208 (('~fsqrt', ('fexp2', a)), ('fexp2', ('fmul', 0.5, a))),
209 (('~frcp', ('fexp2', a)), ('fexp2', ('fneg', a))),
210 (('~frsq', ('fexp2', a)), ('fexp2', ('fmul', -0.5, a))),
211 (('~flog2', ('fsqrt', a)), ('fmul', 0.5, ('flog2', a))),
212 (('~flog2', ('frcp', a)), ('fneg', ('flog2', a))),
213 (('~flog2', ('frsq', a)), ('fmul', -0.5, ('flog2', a))),
214 (('~flog2', ('fpow', a, b)), ('fmul', b, ('flog2', a))),
215 (('~fadd', ('flog2', a), ('flog2', b)), ('flog2', ('fmul', a, b))),
216 (('~fadd', ('flog2', a), ('fneg', ('flog2', b))), ('flog2', ('fdiv', a, b))),
217 (('~fmul', ('fexp2', a), ('fexp2', b)), ('fexp2', ('fadd', a, b))),
218 # Division and reciprocal
219 (('~fdiv', 1.0, a), ('frcp', a)),
220 (('fdiv', a, b), ('fmul', a, ('frcp', b)), 'options->lower_fdiv'),
221 (('~frcp', ('frcp', a)), a),
222 (('~frcp', ('fsqrt', a)), ('frsq', a)),
223 (('fsqrt', a), ('frcp', ('frsq', a)), 'options->lower_fsqrt'),
224 (('~frcp', ('frsq', a)), ('fsqrt', a), '!options->lower_fsqrt'),
225 # Boolean simplifications
226 (('ieq', 'a@bool', True), a),
227 (('ine', 'a@bool', True), ('inot', a)),
228 (('ine', 'a@bool', False), a),
229 (('ieq', 'a@bool', False), ('inot', 'a')),
230 (('bcsel', a, True, False), ('ine', a, 0)),
231 (('bcsel', a, False, True), ('ieq', a, 0)),
232 (('bcsel', True, b, c), b),
233 (('bcsel', False, b, c), c),
234 # The result of this should be hit by constant propagation and, in the
235 # next round of opt_algebraic, get picked up by one of the above two.
236 (('bcsel', '#a', b, c), ('bcsel', ('ine', 'a', 0), b, c)),
237
238 (('bcsel', a, b, b), b),
239 (('fcsel', a, b, b), b),
240
241 # Conversions
242 (('i2b', ('b2i', a)), a),
243 (('f2i', ('ftrunc', a)), ('f2i', a)),
244 (('f2u', ('ftrunc', a)), ('f2u', a)),
245 (('i2b', ('ineg', a)), ('i2b', a)),
246 (('i2b', ('iabs', a)), ('i2b', a)),
247 (('fabs', ('b2f', a)), ('b2f', a)),
248 (('iabs', ('b2i', a)), ('b2i', a)),
249
250 # Byte extraction
251 (('ushr', a, 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
252 (('iand', 0xff, ('ushr', a, 16)), ('extract_u8', a, 2), '!options->lower_extract_byte'),
253 (('iand', 0xff, ('ushr', a, 8)), ('extract_u8', a, 1), '!options->lower_extract_byte'),
254 (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
255
256 # Word extraction
257 (('ushr', a, 16), ('extract_u16', a, 1), '!options->lower_extract_word'),
258 (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'),
259
260 # Subtracts
261 (('~fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
262 (('isub', a, ('isub', 0, b)), ('iadd', a, b)),
263 (('ussub_4x8', a, 0), a),
264 (('ussub_4x8', a, ~0), 0),
265 (('fsub', a, b), ('fadd', a, ('fneg', b)), 'options->lower_sub'),
266 (('isub', a, b), ('iadd', a, ('ineg', b)), 'options->lower_sub'),
267 (('fneg', a), ('fsub', 0.0, a), 'options->lower_negate'),
268 (('ineg', a), ('isub', 0, a), 'options->lower_negate'),
269 (('~fadd', a, ('fsub', 0.0, b)), ('fsub', a, b)),
270 (('iadd', a, ('isub', 0, b)), ('isub', a, b)),
271 (('fabs', ('fsub', 0.0, a)), ('fabs', a)),
272 (('iabs', ('isub', 0, a)), ('iabs', a)),
273
274 # Propagate negation up multiplication chains
275 (('fmul', ('fneg', a), b), ('fneg', ('fmul', a, b))),
276 (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
277
278 # Misc. lowering
279 (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
280 (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
281 (('uadd_carry', a, b), ('b2i', ('ult', ('iadd', a, b), a)), 'options->lower_uadd_carry'),
282 (('usub_borrow', a, b), ('b2i', ('ult', a, b)), 'options->lower_usub_borrow'),
283
# Bitfield insert/extract lowering: when 31 < bits the replacement selects
# the whole 'insert' / 'value' operand, otherwise it uses bfi+bfm, ibfe or
# ubfe respectively.
284 (('bitfield_insert', 'base', 'insert', 'offset', 'bits'),
285 ('bcsel', ('ilt', 31, 'bits'), 'insert',
286 ('bfi', ('bfm', 'bits', 'offset'), 'insert', 'base')),
287 'options->lower_bitfield_insert'),
288
289 (('ibitfield_extract', 'value', 'offset', 'bits'),
290 ('bcsel', ('ilt', 31, 'bits'), 'value',
291 ('ibfe', 'value', 'offset', 'bits')),
292 'options->lower_bitfield_extract'),
293
294 (('ubitfield_extract', 'value', 'offset', 'bits'),
295 ('bcsel', ('ult', 31, 'bits'), 'value',
296 ('ubfe', 'value', 'offset', 'bits')),
297 'options->lower_bitfield_extract'),
298
# Byte/word extract lowering to shift-and-mask (unsigned) or
# shift-left-then-arithmetic-shift-right (signed) sequences.
299 (('extract_i8', a, b),
300 ('ishr', ('ishl', a, ('imul', ('isub', 3, b), 8)), 24),
301 'options->lower_extract_byte'),
302
303 (('extract_u8', a, b),
304 ('iand', ('ushr', a, ('imul', b, 8)), 0xff),
305 'options->lower_extract_byte'),
306
307 (('extract_i16', a, b),
308 ('ishr', ('ishl', a, ('imul', ('isub', 1, b), 16)), 16),
309 'options->lower_extract_word'),
310
311 (('extract_u16', a, b),
312 ('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
313 'options->lower_extract_word'),
314
# Pack lowering: clamp, scale by the format maximum, round to even, convert
# to integer, then pack the components into one value.
315 (('pack_unorm_2x16', 'v'),
316 ('pack_uvec2_to_uint',
317 ('f2u', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))),
318 'options->lower_pack_unorm_2x16'),
319
320 (('pack_unorm_4x8', 'v'),
321 ('pack_uvec4_to_uint',
322 ('f2u', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),
323 'options->lower_pack_unorm_4x8'),
324
325 (('pack_snorm_2x16', 'v'),
326 ('pack_uvec2_to_uint',
327 ('f2i', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))),
328 'options->lower_pack_snorm_2x16'),
329
330 (('pack_snorm_4x8', 'v'),
331 ('pack_uvec4_to_uint',
332 ('f2i', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
333 'options->lower_pack_snorm_4x8'),
334
# Unpack lowering: extract each component, convert to float and divide by
# the format maximum; the snorm variants additionally clamp to [-1.0, 1.0].
335 (('unpack_unorm_2x16', 'v'),
336 ('fdiv', ('u2f', ('vec2', ('extract_u16', 'v', 0),
337 ('extract_u16', 'v', 1))),
338 65535.0),
339 'options->lower_unpack_unorm_2x16'),
340
341 (('unpack_unorm_4x8', 'v'),
342 ('fdiv', ('u2f', ('vec4', ('extract_u8', 'v', 0),
343 ('extract_u8', 'v', 1),
344 ('extract_u8', 'v', 2),
345 ('extract_u8', 'v', 3))),
346 255.0),
347 'options->lower_unpack_unorm_4x8'),
348
349 (('unpack_snorm_2x16', 'v'),
350 ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec2', ('extract_i16', 'v', 0),
351 ('extract_i16', 'v', 1))),
352 32767.0))),
353 'options->lower_unpack_snorm_2x16'),
354
355 (('unpack_snorm_4x8', 'v'),
356 ('fmin', 1.0, ('fmax', -1.0, ('fdiv', ('i2f', ('vec4', ('extract_i8', 'v', 0),
357 ('extract_i8', 'v', 1),
358 ('extract_i8', 'v', 2),
359 ('extract_i8', 'v', 3))),
360 127.0))),
361 'options->lower_unpack_snorm_4x8'),
362 ]
363
def fexp2i(exp):
    # Returns the expression tuple ((exp + 127) << 23) built from integer ops.
    # ldexp32() multiplies a float by the result, treating it as a power of
    # two (127 and 23 are presumably the binary32 exponent bias and mantissa
    # width -- confirm).
    #
    # No clamping happens here: the caller must pass an exp that is already
    # in range.
    biased_exponent = ('iadd', exp, 127)
    return ('ishl', biased_exponent, 23)
367
def ldexp32(f, exp):
    # Builds the replacement expression for ldexp(f, exp) as two successive
    # multiplications by powers of two.
    #
    # Step 1: clamp the exponent.  The widest range ever needed is the
    # exponent range of a float ([-127, 128] counting inf and 0) widened by
    # the number of mantissa bits on each side to cover denormals, i.e. at
    # least [-150, 151].  What actually matters for this implementation is
    # that neither exp/2 nor exp - exp/2 leaves the regular floating-point
    # exponent range.
    clamped = ('imin', ('imax', exp, -252), 254)

    # Step 2: split the exponent into exp/2 and exp - exp/2 and turn each
    # half into a power of two.  The spec defines ldexp as f * 2.0^exp, but
    # a single multiply breaks down around denormals because 2.0^exp may not
    # be representable even when ldexp(f, exp) is.  Multiplying by the two
    # halves in turn doubles the effective exponent range.
    first_half = ('ishr', clamped, 1)
    second_half = ('isub', clamped, ('ishr', clamped, 1))

    scaled_once = ('fmul', f, fexp2i(first_half))
    return ('fmul', scaled_once, fexp2i(second_half))
388
# Register the ldexp rule: replace ('ldexp', x, exp) with the expression
# built by ldexp32().
optimizations.append((('ldexp', 'x', 'exp'), ldexp32('x', 'exp')))
390
391 # Unreal Engine 4 demo applications open-codes bitfieldReverse()
392 def bitfield_reverse(u):
393 step1 = ('ior', ('ishl', u, 16), ('ushr', u, 16))
394 step2 = ('ior', ('ishl', ('iand', step1, 0x00ff00ff), 8), ('ushr', ('iand', step1, 0xff00ff00), 8))
395 step3 = ('ior', ('ishl', ('iand', step2, 0x0f0f0f0f), 4), ('ushr', ('iand', step2, 0xf0f0f0f0), 4))
396 step4 = ('ior', ('ishl', ('iand', step3, 0x33333333), 2), ('ushr', ('iand', step3, 0xcccccccc), 2))
397 step5 = ('ior', ('ishl', ('iand', step4, 0x55555555), 1), ('ushr', ('iand', step4, 0xaaaaaaaa), 1))
398
399 return step5
400
# Register the rule that collapses the open-coded sequence into a single
# 'bitfield_reverse' op.
optimizations.append((bitfield_reverse('x'), ('bitfield_reverse', 'x')))
402
403
# Push a comparison against a constant through a select whose two arms are
# constants.  For example
#
#    (a ? 0 : 1) > 0
#
# becomes
#
#    a ? (0 > 0) : (1 > 0)
#
# Constant folding then resolves both arms, and the boolean reductions in
# the main table reduce the remaining select to the original variable "a".
# One rule is added per operand order of the comparison.
for op in ('flt', 'fge', 'feq', 'fne',
           'ilt', 'ige', 'ieq', 'ine', 'ult', 'uge'):
    # select on the left-hand side of the comparison
    optimizations.append(((op, ('bcsel', 'a', '#b', '#c'), '#d'),
                          ('bcsel', 'a', (op, 'b', 'd'), (op, 'c', 'd'))))
    # select on the right-hand side of the comparison
    optimizations.append(((op, '#d', ('bcsel', 'a', '#b', '#c')),
                          ('bcsel', 'a', (op, 'd', 'b'), (op, 'd', 'c'))))
424
425 # This section contains "late" optimizations that should be run after the
426 # regular optimizations have finished. Optimizations should go here if
427 # they help code generation but do not necessarily produce code that is
428 # more easily optimizable.
# Entries use the same (<search>, <replace>[, <condition>]) tuple form as the
# entries of the `optimizations` table.
429 late_optimizations = [
430 # Most of these optimizations aren't quite safe when you get infinity or
431 # NaN involved but the first one should be fine.
# Accordingly only the first rule is exact; the other three carry the '~'
# (inexact) prefix.
432 (('flt', ('fadd', a, b), 0.0), ('flt', a, ('fneg', b))),
433 (('~fge', ('fadd', a, b), 0.0), ('fge', a, ('fneg', b))),
434 (('~feq', ('fadd', a, b), 0.0), ('feq', a, ('fneg', b))),
435 (('~fne', ('fadd', a, b), 0.0), ('fne', a, ('fneg', b))),
436
# Rewrite dot products to their *_replicated forms; all four rules carry the
# 'options->fdot_replicates' condition string.
437 (('fdot2', a, b), ('fdot_replicated2', a, b), 'options->fdot_replicates'),
438 (('fdot3', a, b), ('fdot_replicated3', a, b), 'options->fdot_replicates'),
439 (('fdot4', a, b), ('fdot_replicated4', a, b), 'options->fdot_replicates'),
440 (('fdph', a, b), ('fdph_replicated', a, b), 'options->fdot_replicates'),
441 ]
442
# Write the rendered output of both passes to stdout, main table first and
# late table second.
#
# The call form print(x) is used instead of the Python 2-only print
# statement.  With a single argument it produces identical output under
# Python 2 (where it parses as a print statement with a parenthesised
# expression) and is valid syntax under Python 3.
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic", optimizations).render())
print(nir_algebraic.AlgebraicPass("nir_opt_algebraic_late",
                                  late_optimizations).render())