From 76289fbfa84a06ef4db8ad44ea0eb88ad0be8d5c Mon Sep 17 00:00:00 2001
From: Matt Turner <mattst88@gmail.com>
Date: Thu, 21 Jan 2016 09:09:48 -0800
Subject: [PATCH] nir: Recognize open-coded extract_u8.

Two shaders that appear in Unigine benchmarks (Heaven and Valley) unpack
three bytes from an integer and convert each into a float:

   float((val >> 16u) & 0xffu)
   float((val >>  8u) & 0xffu)
   float((val >>  0u) & 0xffu)

Instead of shifting, masking, and type converting like this:

   shr(8)          g15<1>UD        g25<8,8,1>UD    0x00000010UD
   and(8)          g16<1>UD        g15<8,8,1>UD    0x000000ffUD
   mov(8)          g17<1>F         g16<8,8,1>UD

   shr(8)          g18<1>UD        g25<8,8,1>UD    0x00000008UD
   and(8)          g19<1>UD        g18<8,8,1>UD    0x000000ffUD
   mov(8)          g20<1>F         g19<8,8,1>UD

   and(8)          g21<1>UD        g25<8,8,1>UD    0x000000ffUD
   mov(8)          g22<1>F         g21<8,8,1>UD

i965 can simply extract a byte and convert to float in a single
instruction:

   mov(8)          g17<1>F         g25.2<32,8,4>UB
   mov(8)          g20<1>F         g25.1<32,8,4>UB
   mov(8)          g22<1>F         g25.0<32,8,4>UB

This patch implements the first step: recognizing byte extraction. A
later patch will optimize out the conversion to float.

   instructions in affected programs: 28568 -> 27450 (-3.91%)
   helped: 7

   cycles in affected programs: 210076 -> 203144 (-3.30%)
   helped: 7

This patch decreases the number of instructions in the two Unigine
programs by:

 #1721: 4520 -> 4374 instructions (-3.23%)
 #1706: 3752 -> 3582 instructions (-4.53%)

Reviewed-by: Kenneth Graunke <kenneth@whitecape.org>
Reviewed-by: Iago Toral Quiroga <itoral@igalia.com>
---
 src/compiler/nir/nir_opt_algebraic.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index cc2c2299ab9..dd26805e1a8 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -162,6 +162,7 @@ optimizations = [
    (('ishr', a, 0), a),
    (('ushr', 0, a), 0),
    (('ushr', a, 0), a),
+   (('iand', 0xff, ('ushr', a, 24)), ('ushr', a, 24)),
    # Exponential/logarithmic identities
    (('fexp2', ('flog2', a)), a), # 2^lg2(a) = a
    (('flog2', ('fexp2', a)), a), # lg2(2^a) = a
@@ -213,6 +214,12 @@ optimizations = [
    (('f2i', ('ftrunc', a)), ('f2i', a)),
    (('f2u', ('ftrunc', a)), ('f2u', a)),
 
+   # Byte extraction
+   (('ushr', a, 24), ('extract_u8', a, 3), '!options->lower_extract_byte'),
+   (('iand', 0xff, ('ushr', a, 16)), ('extract_u8', a, 2), '!options->lower_extract_byte'),
+   (('iand', 0xff, ('ushr', a,  8)), ('extract_u8', a, 1), '!options->lower_extract_byte'),
+   (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'),
+
    # Subtracts
    (('fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)),
    (('isub', a, ('isub', 0, b)), ('iadd', a, b)),
-- 
2.30.2